library(rdhs) # for importing data from the API
library(tidyverse) # for data manipulation
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(haven) # for handling labelled columns
library(sf) # for handling shapefiles
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(gstat) # for mapping and krigging
library(sp) # for coordinate transformations
library(spdep) # for getting area data weights
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(geojsonsf) # for converting shapefile into sp
library(ggvoronoi) # for tiling point data into area data
library(spatialreg) # for spatial regression
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Attaching package: 'spatialreg'
##
## The following objects are masked from 'package:spdep':
##
## get.ClusterOption, get.coresOption, get.mcOption,
## get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
## set.coresOption, set.mcOption, set.VerboseOption,
## set.ZeroPolicyOption
library(GWmodel) # for geographically weighted regression
## Loading required package: maptools
## Checking rgeos availability: TRUE
## Please note that 'maptools' will be retired during 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
## Loading required package: robustbase
## Loading required package: Rcpp
## Welcome to GWmodel version 2.2-9.
library(caret) # decision tree with cross validation
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart) # decision tree without cross validation
library(rpart.plot) # for plotting dtree results
library(Metrics) # for testing models
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
# Import the DHS 2019 Sierra Leone household recode plus the national,
# regional, and cluster shapefiles, then attach cluster GPS attributes to
# every household record and cache everything under ./exports.
# manually read in the stata file
sl19 <- read_dta('./data/SLHR7ADT/SLHR7AFL.DTA')
# keep the human-readable survey variable labels alongside the data
var_labels <- get_variable_labels(sl19)
# read in shapefiles of the country, regions, and survey clusters
# https://gis.stackexchange.com/questions/19064/opening-shapefile-in-r
# read in the shapefile of the country
country <- read_sf(dsn = "./data/sdr_national_data_2023-04-17/shps", layer = "sdr_national_data_dhs_2019")
plot(country)
## Warning: plotting the first 9 out of 37 attributes; use max.plot = 37 to plot
## all
# read in the shapefile of regions
regions <- read_sf(dsn = "./data/sdr_subnational_boundaries_2023-04-17/shps", layer = "sdr_subnational_boundaries2")
plot(regions)
## Warning: plotting the first 9 out of 27 attributes; use max.plot = 27 to plot
## all
# read in shapefile of survey cluster coordinates
clusters <- read_sf(dsn = "./data/SLGE7AFL", layer = "SLGE7AFL")
plot(clusters)
## Warning: plotting the first 9 out of 20 attributes; use max.plot = 20 to plot
## all
# "hv001" in sl19 dataframe tells us which cluster number each household belongs to
# "DHSCLUST" in gps dataframe contains the cluster number
# "LATNUM" and "LONGNUM" contain the coordinates of each cluster
# we'll need to join these two dfs to create maps of the variables we're interested in
# https://sparkbyexamples.com/r-programming/how-to-do-left-join-in-r/
# Left join (all.x = TRUE): keep every household even if its cluster lacks a
# GPS record. NOTE(review): merge() dispatches on the household data, so the
# result is presumably a plain data frame carrying `geometry` as a list
# column rather than an sf object — confirm downstream code expects that.
sl19gps <- merge(
x=sl19,
y=clusters,
by.x="hv001",
by.y="DHSCLUST",
all.x=TRUE)
# export useful data to exports folder
saveRDS(country, "./exports/country.rds")
saveRDS(regions, "./exports/regions.rds")
saveRDS(clusters, "./exports/clusters.rds")
saveRDS(var_labels, "./exports/var_labels.rds")
saveRDS(sl19, "./exports/sl19.rds")
saveRDS(sl19gps, "./exports/sl19gps.rds")
# reload from disk so later chunks can re-run without repeating the import
sl19gps <- readRDS("./exports/sl19gps.rds")
df <- sl19gps
# Trim the merged household/GPS table down to the analysis columns and drop
# clusters whose coordinates were withheld (recorded with latitude 0).
# Survey variables of interest (household composition, dwelling, assets,
# wealth index).
survey_cols <- c("hv000","hv001","hv006","hv007","hv010","hv011","hv012","hv013","hv014","hv024","hv025","hv040","hv045c","hv201","hv204","hv205","hv206","hv207","hv208","hv209","hv210","hv211","hv212","hv213","hv214","hv215","hv216","hv217","hv219","hv220","hv221","hv226","hv227","hv230a","hv237","hv241","hv243a","hv243b","hv243c","hv243d","hv243e","hv244","hv245","hv246","hv246a","hv246b","hv246c","hv246d","hv246e","hv246f","hv247","hv270","hv271","hv270a","hv271a","hml1")
# GPS attributes carried over from the cluster shapefile.
location_cols <- c("LATNUM", "LONGNUM", "geometry", "ALT_DEM", "DATUM", "DHSREGNA", "ADM1NAME", "DHSID")
# Subset to those columns, then remove rows with masked (0) coordinates.
df <- sl19gps[, c(survey_cols, location_cols)]
df <- subset(df, LATNUM != 0)
# re-export the cleaned subset for later chunks
sl19clean <- df
saveRDS(sl19clean, "./exports/sl19clean.rds")
# Dropping all data with coordinates (0,0): DHS masks withheld locations by
# recording latitude/longitude 0.
plottable_df <- filter(sl19gps, LATNUM > 0)
# Convert hv206 (has electricity, 0/1, haven-labelled) to a factor so ggplot
# uses a discrete colour scale compatible with geom_point.
plottable_df$hv206 <- as.factor(plottable_df$hv206)
ggplot(data = regions) +
  geom_sf() +
  # FIX: alpha is a constant, so it belongs outside aes(); mapping it inside
  # aes() produced a spurious "0.3" alpha legend
  geom_point(data=plottable_df, aes(x=LONGNUM, y=LATNUM, color=hv206), alpha=0.3) +
  scale_color_manual(values = c("1" = "yellow", "0" = "white"))+
  labs(color="Access to Electricity")+
  ggtitle('Household Access to Electricity in Sierra Leone') +
  theme_minimal()+
  ggspatial::annotation_scale()+
  ggspatial::annotation_north_arrow(location = "tr", which_north = "true") +
  labs(y= "", x = "") +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        rect = element_blank()) +
  # FIX: this was a free-standing statement, so it only printed the theme
  # object (the "List of 1" output); it must be added to the plot with `+`
  theme(plot.title = element_text(size = 20))
# Each cluster dot overplots several households, so a single visible point is
# not necessarily representative; the interpolated surfaces built later give
# a better overall impression of wealth. hv271 (wealth index) is converted
# from haven-labelled to plain numeric for the continuous colour scale.
plottable_df$hv271 <- as.numeric(plottable_df$hv271)
ggplot(data = regions) +
  geom_sf() +
  geom_point(data=plottable_df, aes(x=LONGNUM, y=LATNUM, color=hv271)) +
  scale_color_gradient(low="white",
                       high="darkgreen", space ="Lab")+
  labs(color="Wealth Level")+
  ggtitle('Household Wealth Distribution in Sierra Leone') +
  theme_minimal()+
  ggspatial::annotation_scale()+
  ggspatial::annotation_north_arrow(location = "tr", which_north = "true") +
  labs(y= "", x = "") +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        rect = element_blank()) +
  # FIX: previously a separate statement that merely printed the theme object
  # (the "List of 1" output); adding it with `+` actually enlarges the title
  theme(plot.title = element_text(size = 20))
# import data
sl19keep <- readRDS("./exports/sl19clean.rds")
# Histograms ----
# FIX: this section heading was a bare symbol (`Histograms`) left over from
# the R Markdown source, which raises "object 'Histograms' not found" when
# the script is run; it is now a comment.
# histograms
# quick base-graphics look at altitude, region, and language distributions
hist(sl19keep$hv040, main="Altitude (m)") # altitude in meters
plot(sl19keep$hv024, main="Region") # region
plot(sl19keep$hv045c, main="Native Language")
# Histogram helper: wraps geom_histogram with the shared look used for all
# the numeric survey variables.
#   x     numeric vector to bin
#   bins  number of histogram bins
#   label text used for both the x-axis and the title
#   fill  bar fill colour
# Returns a ggplot object (printed by the caller).
ilg_hist <- function(x, bins, label, fill) {
  ggplot() +
    geom_histogram(aes(x), color = "black", fill = fill, bins = bins) +
    xlab(label) +
    ylab("Count") +
    ggtitle(label) +
    theme_bw()
}
# One histogram per household-composition / dwelling variable; plots are kept
# in named objects and printed explicitly.
household <- ilg_hist(sl19keep$hv012, 33, "# Household Members", "purple")
print(household)
women <- ilg_hist(sl19keep$hv010, 11, "# Women aged 15-49", "purple")
print(women)
men <- ilg_hist(sl19keep$hv011, 11, "# Men aged 15-49", "purple")
print(men)
children <- ilg_hist(sl19keep$hv014, 10, "# Children under 5 y.o.", "purple")
print(children)
altitude <- ilg_hist(sl19keep$hv040, 30, "Altitude", "blue")
print(altitude)
timetowater <- ilg_hist(sl19keep$hv204, 30, "Time to Water Source", "darkgreen")
print(timetowater) # filter out high "unknowns"
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.
# Bar Charts ----
# Bar-chart helper: mirrors ilg_hist() but for categorical variables, so
# every bar chart shares the same theme and labelling.
#   x     vector of category codes to tabulate
#   label text used for both the x-axis and the title
#   fill  bar fill colour
# Returns a ggplot object (printed by the caller).
ilg_bar <- function(x, label, fill) {
  ggplot() +
    geom_bar(aes(x), color = "black", fill = fill) +
    xlab(label) +
    ylab("Count") +
    ggtitle(label) +
    theme_bw()
}
# Bar charts of the categorical survey variables (region, language, water
# source); the scale warnings come from the haven-labelled column types.
region <- ilg_bar(sl19keep$hv024, "Region", "blue")
print(region)
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.
language <- ilg_bar(sl19keep$hv045c, "Language", "blue")
print(language)
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.
watersource <- ilg_bar(sl19keep$hv201, "Source of Drinking Water", "darkgreen")
print(watersource)
## Don't know how to automatically pick scale for object of type
## <haven_labelled/vctrs_vctr/double>. Defaulting to continuous.
# Pie Charts ----
# pie charts of asset ownership
# Count households owning each asset; the hv2xx columns are yes=1/no=0
# indicators. FIX: na.rm = TRUE keeps a stray missing response from turning
# the whole count (and therefore the pie) into NA.
elec <- sum(sl19keep$hv206 == 1, na.rm = TRUE)
radio <- sum(sl19keep$hv207 == 1, na.rm = TRUE)
tele <- sum(sl19keep$hv208 == 1, na.rm = TRUE)
fridge <- sum(sl19keep$hv209 == 1, na.rm = TRUE)
bicycle <- sum(sl19keep$hv210 == 1, na.rm = TRUE)
motorcycle <- sum(sl19keep$hv211 == 1, na.rm = TRUE)
car <- sum(sl19keep$hv212 == 1, na.rm = TRUE)
# total number of households: the denominator for every pie below
n <- nrow(sl19keep)
pie(c(elec, n-elec), labels = c("Yes", "No"), main="Have Electricity", col=c("purple","white"))
pie(c(radio, n-radio), labels = c("Yes", "No"), main="Have Radio", col=c("purple","white"))
pie(c(tele, n-tele), labels = c("Yes", "No"), main="Have Television", col=c("purple","white"))
pie(c(fridge, n-fridge), labels = c("Yes", "No"), main="Have Fridge", col=c("purple","white"))
pie(c(bicycle, n-bicycle), labels = c("Yes", "No"), main="Have Bicycle", col=c("purple","white"))
pie(c(motorcycle, n-motorcycle), labels = c("Yes", "No"), main="Have Motorcycle", col=c("purple","white"))
pie(c(car, n-car), labels = c("Yes", "No"), main="Have Car", col=c("purple","white"))
# Multivariable Bar Chart ----
# bar chart of asset ownership
# Counts computed for the pie charts above, one bar per asset.
counts <- c(elec,radio,tele,fridge,bicycle,motorcycle,car)
# display labels for the bars (FIX: "Motorcyle" typo corrected)
labels <- c("Electricity", "Radio", "Television", "Fridge", "Bicycle", "Motorcycle", "Car")
assets <- data.frame(asset=labels, count=counts)
ggplot(assets) +
  geom_bar(aes(y=count, x=asset), fill="purple", stat='identity') +
  ggtitle('Asset Ownership')+
  xlab('Asset') +
  ylab('Count') +
  ylim(0,15000) +
  theme_bw() +
  scale_color_brewer(palette="Dark2")+
  # FIX: `style=` is not a geom_hline parameter (ggplot warned and silently
  # ignored it); `linetype` draws the intended dashed reference line at the
  # total household count n
  geom_hline(yintercept=n, linetype="dashed", color="gray")
# Project the cluster points from geographic lon/lat (WGS84) onto a planar
# CRS so that distance-based interpolation (IDW, kriging) works in metres.
sl19clean <- readRDS("./exports/sl19clean.rds")
df <- sl19clean
# project the cluster points
# promote the data frame to a SpatialPointsDataFrame using the GPS columns
coordinates(df) = c('LONGNUM', 'LATNUM')
proj4string(df) = CRS("+proj=longlat +datum=WGS84")
# NOTE(review): these Albers parameters (lat_1=34, lat_2=40.5, lon_0=-120)
# look like a California/US template rather than one centred on Sierra Leone
# (~8N, ~-12E); coordinates are still planar metres, but distortion may be
# larger than necessary — confirm the choice of projection.
plottable_df_projected<- spTransform(df, CRS("+proj=aea +lat_1=34 +lat_2=40.5 +lat_0=0 +lon_0=-120 +x_0=0 +y_0=-4000000 +ellps=WGS84 +datum=WGS84 +units=m +no_defs"))
saveRDS(plottable_df_projected, "./exports/plottable_df_projected.rds")
# Project the region polygons into the same planar CRS as the points, then
# build the prediction grid that IDW and kriging will fill in.
regions <- readRDS("./exports/regions.rds")
# project the region boundaries
# NOTE(review): "oultine" is a misspelling of "outline"; left as-is because
# the name is used consistently within this chunk.
regions_oultine <-as(regions, "Spatial")
proj4string(regions_oultine) = CRS("+proj=longlat +datum=WGS84")
regions_projected<- spTransform(regions_oultine, CRS("+proj=aea +lat_1=34 +lat_2=40.5 +lat_0=0 +lon_0=-120 +x_0=0 +y_0=-4000000 +ellps=WGS84 +datum=WGS84 +units=m +no_defs"))
# In this next chunk I am making the grid for IDW and Kriging
# Making grid: ~15,000 candidate cells over the projected bounding box.
# NOTE(review): proj4string() returns a character string here, not a CRS
# object — it appears to be accepted, but CRS(proj4string(...)) would be the
# conventional form; verify.
kriging_grid <- SpatialPixels(SpatialPoints(makegrid(regions_projected, n=15000)), proj4string = proj4string(regions_projected))
# Applying grid to Sierra Leone: keep only cells inside the region polygons
kriging_grid <- kriging_grid[regions_projected,]
plot(kriging_grid)
saveRDS(kriging_grid, "./exports/kriging_grid.rds")
saveRDS(regions_projected, "./exports/regions_projected.rds")
# Interpolate the wealth index (hv271) over the national grid, first with
# inverse distance weighting, then prepare a sample variogram for kriging.
plottable_df_projected <- readRDS("./exports/plottable_df_projected.rds")
df <- plottable_df_projected
kriging_grid <- readRDS("./exports/kriging_grid.rds")
# Inverse distance weighting
# idp = 2: weights decay with squared distance; nmax = 100 nearest clusters
hv271.idw <- idw(formula = hv271 ~ 1, df, kriging_grid, idp = 2, nmax = 100)
## [inverse distance weighted interpolation]
plot(hv271.idw)
# Plotting variogram for sample data
# sample variogram based on Cressie method, it is often used to account for outliers
p.vgm <- variogram(hv271 ~ 1, df, cressie=T)
plot(p.vgm, type='l')
# Reference block to see variogram model options
vgm()
## short long
## 1 Nug Nug (nugget)
## 2 Exp Exp (exponential)
## 3 Sph Sph (spherical)
## 4 Gau Gau (gaussian)
## 5 Exc Exclass (Exponential class/stable)
## 6 Mat Mat (Matern)
## 7 Ste Mat (Matern, M. Stein's parameterization)
## 8 Cir Cir (circular)
## 9 Lin Lin (linear)
## 10 Bes Bes (bessel)
## 11 Pen Pen (pentaspherical)
## 12 Per Per (periodic)
## 13 Wav Wav (wave)
## 14 Hol Hol (hole)
## 15 Log Log (logarithmic)
## 16 Pow Pow (power)
## 17 Spl Spl (spline)
## 18 Leg Leg (Legendre)
## 19 Err Err (Measurement error)
## 20 Int Int (Intercept)
show.vgms()
# Initial estimates for nugget, sill, and range (4,300,000,000; 7,900,000,000; 60,000)
# NOTE(review): despite "krig" in the file name, this saves the IDW surface
saveRDS(hv271.idw, "./exports/hv271_krig_plot.rds")
# Fitting sample variogram to a model
# Starting values eyeballed from the sample variogram:
# nugget 4.3e9, partial sill 7.9e9, range 60 km.
initModel <- vgm(psill = 7900000000, "Sph", range = 60000, nugget = 4300000000)
p.fitSph <- fit.variogram(p.vgm, model = initModel)
plot(p.vgm, pch = 20, cex = 1.5, col = "black", ylab = "Semivariance", xlab = "Distance (m)", model = p.fitSph)
# Exponential alternative with the same starting values.
expModel <- vgm(psill = 7900000000, "Exp", range = 60000, nugget = 4300000000)
p.fitExp <- fit.variogram(p.vgm, model = expModel)
plot(p.vgm, pch = 20, cex = 1.5, col = "black", ylab = "Semivariance", xlab = "Distance (m)", model = p.fitExp)
# Linear alternative. FIX: the fitted object was previously named `p.fitMat`,
# wrongly suggesting a Matern model even though "Lin" is fitted; renamed
# (the name is only used on the plot line below, so nothing else changes).
linModel <- vgm(psill = 7900000000, "Lin", range = 60000, nugget = 4300000000)
p.fitLin <- fit.variogram(p.vgm, model = linModel)
plot(p.vgm, pch = 20, cex = 1.5, col = "black", ylab = "Semivariance", xlab = "Distance (m)", model = p.fitLin)
# After trying a few variograms, it appears Spherical is the best model
# Kriging model
# Duplicate locations make the kriging covariance matrix singular, so drop
# them first: https://gis.stackexchange.com/questions/222192/r-gstat-krige-covariance-matrix-singular-at-location-5-88-47-4-0-skipping
# FIX: guard the empty case — when zerodist() finds no duplicates,
# `df[-integer(0), ]` would silently drop EVERY row.
dupes <- zerodist(df)
if (nrow(dupes) > 0) {
  df <- df[-dupes[, 1], ]
}
krigSph <- krige(hv271 ~ 1, df, kriging_grid, model = p.fitSph)
## [using ordinary kriging]
plot(krigSph)
saveRDS(krigSph, "./exports/kriging_model1.rds")
# Evaluating accuracy with Cross Validation - Sph variant commented out to
# run faster, but can be uncommented and run to evaluate
# cvKriging<-krige.cv(hv271~1, df, kriging_grid, model=p.fitSph)
# # get the RMSE
# rmse<-sqrt(mean(cvKriging$residual^2))
# rmse
#
cvKriging<-krige.cv(hv271~1, df, kriging_grid, model=p.fitExp)
# get the RMSE
rmse2<-sqrt(mean(cvKriging$residual^2))
rmse2
## [1] 63120.57
# The exponential fit beats the spherical one in cross validation, so the
# final surface is kriged with p.fitExp
krigExp <- krige(hv271 ~ 1, df, kriging_grid, model = p.fitExp)
## [using ordinary kriging]
plot(krigExp, main="Spatial Interpolation") # FIX: title was truncated ("Spatial Inter")
# FIX: previously saved krigSph again, so the "better" file actually held
# the worse (spherical) model; save the exponential kriging instead
saveRDS(krigExp, "./exports/better_kriging_model.rds")
# Build a numeric, cluster-level data set of asset ownership: convert the
# labelled survey columns to numeric, drop missing/unknown codes, and average
# households within each cluster (binary indicators become proportions).
# reread in dataframe
sl19clean <- readRDS("./exports/sl19clean.rds")
df <- sl19clean
# define columns to keep
assetColumns <- c("hv206","hv207","hv208","hv209","hv210","hv211","hv212","hv221","hv227","hv243a","hv243b","hv243c","hv243d","hv243e","hv246a","hv246b","hv246c","hv246d","hv246e","hv246f","hv247", "hv271")
gpsColumns <- c("LATNUM", "LONGNUM", "ALT_DEM")
srColumns <- c(assetColumns, gpsColumns)
# filter big data frame to just columns needed for this
df <- df[,srColumns]
# convert all columns to numeric
# (haven-labelled columns are coerced to their plain numeric codes)
df <- as.data.frame(lapply(df, as.numeric))
# add geometry back in
df$geometry <- sl19clean$geometry
# filter out "unknown" or "missing" values from survey
df <- na.omit(df) # remove NA values
# hv246a-f are livestock counts; values of 95+ appear to be DHS special
# codes ("95 or more" / "don't know") — verify against the recode manual
df <- df %>% filter(
hv246a < 95 &
hv246b < 95 &
hv246c < 95 &
hv246d < 95 &
hv246e < 95 &
hv246f < 95
)
# remove geometry again (aggregate() cannot average a list column)
geometry <- df$geometry
df = select(df, -geometry)
# aggregate data to cluster level, keep means of other variables
df <- aggregate.data.frame(df, list(lat = df$LATNUM, lon = df$LONGNUM), mean)
sl19clusters <- df
# Build the national outline used to clip the Voronoi tessellation of the
# cluster points into area polygons.
# read in country and regions
regions <- readRDS("./exports/regions.rds")
country <- readRDS("./exports/country.rds")
# Split the national MULTIPOLYGON into its component polygons (mainland plus
# islands) and keep only the mainland.
country_multi <- as.data.frame(st_cast(country$geometry, "POLYGON"))
country_multi$area_sqm <- st_area(country_multi$geometry)
# FIX: previously took the first polygon and merely assumed it was the
# mainland even though the areas had just been computed; selecting the
# largest polygon makes that explicit and robust to polygon order.
sl.sf <- country_multi[which.max(country_multi$area_sqm), "geometry"]
# convert the countries shapefile (sf) into SpatialPolygon (sp) for ggvoronoi
country.sp <- as(sl.sf, 'Spatial')
# create voronoi tesselation plot
# (the GEOS warnings below come from the retiring rgeos/maptools stack that
# ggvoronoi depends on)
ggplot(df) +
  ggvoronoi::geom_voronoi(aes(lon, lat, fill=hv271), outline = country.sp) +
  scale_fill_gradient(low="white", high="blue") +
  geom_sf(data=regions, fill=NA, color="black") +
  ggtitle("Voronoi Tesellation of Sierra Leone")
## Warning: GEOS support is provided by the sf and terra packages among others
## Warning: GEOS support is provided by the sf and terra packages among others
## Warning in gBuffer(outline_spdf, byid = TRUE, width = 0): Spatial object is not
## projected; GEOS expects planar coordinates
# Materialise the Voronoi (Thiessen) polygons as an sf object, map the
# cluster-level wealth and electricity values onto them, and export.
# create voronoi tesselation SpatialPolygon
areas.sp <- ggvoronoi::voronoi_polygon(df, x="lon", y="lat", outline = country.sp)
## Warning: GEOS support is provided by the sf and terra packages among others
## Warning: GEOS support is provided by the sf and terra packages among others
## Warning in gBuffer(outline_spdf, byid = TRUE, width = 0): Spatial object is not
## projected; GEOS expects planar coordinates
# convert voronoi SpatialPolygon to a shapefile
areas.sf <- sf::st_as_sf(areas.sp)
# Abandoned exploration kept for reference: splitting multi-part polygons.
# # remove areas with multiple polygons
# areas.sf$n_polygons <- lapply(areas.sf$geometry, length)
# areas.sf <- filter(areas.sf, n_polygons == 1)
# convert list of polygons into individual polygons
# geometry <- areas.sf$geometry
# geometry2 <- st_cast(geometry, "POLYGON")
# areas.sf$geometry <- geometry2
# areas.sf$geometry3 <- lapply(areas.sf$geometry, unlist)
# plot results
plot(areas.sf)
## Warning: plotting the first 9 out of 27 attributes; use max.plot = 27 to plot
## all
ggplot(data=areas.sf) +
geom_sf(aes(fill=hv271)) +
scale_fill_gradient(low="white", high="darkgreen") +
ggtitle("Mean Wealth Index across Sierra Leone ")
# hv206 is the cluster mean of a 0/1 indicator, so *100 gives percent access
ggplot(data=areas.sf) +
geom_sf(aes(fill=hv206*100)) +
scale_fill_gradient(low="white", high="gold") +
ggtitle("% with Electricity Access in Sierra Leone")
# map just areas and points
ggplot() +
geom_sf(data=areas.sf) +
geom_point(data=plottable_df, aes(x=LONGNUM, y=LATNUM), cex=0.5) +
ggtitle("Thiessen Polygons of DHS 2019 Clusters in Sierra Leone")
# electricity access share and mean wealth are strongly correlated (0.88)
cor(areas.sf$hv206, areas.sf$hv271, method = "pearson")
## [1] 0.8845982
# export area
saveRDS(areas.sf, "./exports/areas.rds")
saveRDS(sl19clusters, "./exports/sl19clusters.rds")
# import dataframe with shape file
sl19areas <- readRDS("./exports/areas.rds")
# keep just asset columns (sf geometry is sticky and stays attached)
assetColumns <- c("hv206","hv207","hv208","hv209","hv210","hv211","hv212","hv221","hv227","hv243a","hv243b","hv243c","hv243d","hv243e","hv246a","hv246b","hv246c","hv246d","hv246e","hv246f","hv247", "hv271")
sl19num <- sl19areas[,assetColumns]
# remove the two areas that have no neighbors (rows 47 and 68)
# FIX: this previously subset the stale `df` object left over from earlier
# chunks and then saved the untouched `sl19num`, so the intended removal
# never reached the exported file
sl19num <- sl19num[-c(47, 68), ]
# save down dataframe
saveRDS(sl19num, "./exports/sl19num.rds")
# Baseline (aspatial) regressions on the area-level asset data ----
# read in numeric dataframe
sl19num <- readRDS("./exports/sl19num.rds")
df <- sl19num
# process df for lm
# set the polygons aside; lm() wants a plain data frame
geometry <- df$geometry
df <- df %>% st_drop_geometry()
# Null model (intercept only): predicts the mean wealth index everywhere and
# sets the RMSE baseline the later models must beat.
# do with no variables
lm.mean <- lm(hv271 ~ 1, data=df)
summary(lm.mean)
##
## Call:
## lm(formula = hv271 ~ 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -110194 -70606 -42934 67121 229770
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1148 3781 0.304 0.762
##
## Residual standard error: 88920 on 552 degrees of freedom
# mean wealth index = 1148 (the intercept in the output above)
rmse.lm.mean <- sqrt(mean(residuals(lm.mean)^2))
rmse.lm.mean # 88841.1 (see output below)
## [1] 88841.1
# map just areas and points
ggplot() +
geom_sf(data=areas.sf, fill="darkgreen", color="black", alpha=0.5) +
ggtitle("Null Model of Wealth Index across Sierra Leone") +
theme_void()
# Full model: regress area mean wealth (hv271) on every asset-ownership
# proportion.
# basic linear model
lm.basic <- lm(hv271 ~ ., data=df)
summary(lm.basic)
##
## Call:
## lm(formula = hv271 ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72590 -13545 -17 12458 78689
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -85372.6 4999.4 -17.076 < 2e-16 ***
## hv206 40871.1 12853.5 3.180 0.001560 **
## hv207 36356.4 6995.2 5.197 2.89e-07 ***
## hv208 56968.7 18013.3 3.163 0.001653 **
## hv209 24710.2 17751.9 1.392 0.164514
## hv210 46779.4 15788.9 2.963 0.003185 **
## hv211 208.9 11527.8 0.018 0.985549
## hv212 138908.1 24755.0 5.611 3.24e-08 ***
## hv221 79367.9 63907.3 1.242 0.214814
## hv227 -37549.5 5276.9 -7.116 3.62e-12 ***
## hv243a 99366.5 6433.6 15.445 < 2e-16 ***
## hv243b 7346.1 6346.4 1.158 0.247577
## hv243c -171181.0 104039.6 -1.645 0.100490
## hv243d -57250.1 17030.5 -3.362 0.000831 ***
## hv243e 17504.0 24117.0 0.726 0.468283
## hv246a 610.1 3096.3 0.197 0.843867
## hv246b 516.6 1112.0 0.465 0.642401
## hv246c 34105.3 13185.4 2.587 0.009958 **
## hv246d -11901.4 1877.1 -6.340 4.91e-10 ***
## hv246e 3365.4 2950.5 1.141 0.254550
## hv246f -2818.3 452.5 -6.228 9.60e-10 ***
## hv247 71281.3 10129.8 7.037 6.10e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20480 on 531 degrees of freedom
## Multiple R-squared: 0.949, Adjusted R-squared: 0.947
## F-statistic: 470.4 on 21 and 531 DF, p-value: < 2.2e-16
rmse.lm.basic <- sqrt(mean(residuals(lm.basic)^2))
rmse.lm.basic # 20066.05 (see output below)
## [1] 20066.05
# adjusted R-squared is 0.947 (per output above)
# r-squared is 0.949
# Test the OLS residuals for spatial autocorrelation under queen-contiguity
# weights; zero.policy = TRUE tolerates areas without neighbours.
# define neighbors & weights
queen_neighbors <- spdep::poly2nb(geometry)
queen_weights <- spdep::nb2listw(queen_neighbors, zero.policy = TRUE)
# Moran I = 0.21 (p < 2.2e-16): the residuals are spatially clustered, which
# motivates the spatial lag models fitted later.
spdep::lm.morantest(lm.basic, queen_weights)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = hv271 ~ ., data = df)
## weights: queen_weights
##
## Moran I statistic standard deviate = 8.6545, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.2115698419 -0.0089133777 0.0006490268
# Cross-validated linear model, then a reduced model dropping the weakest
# predictors ----
# try cross validation
set.seed(44)
df <- sl19num
geometry <- df$geometry
df <- df %>% st_drop_geometry()
# 5-fold cross validation via caret
cv <- trainControl(method = 'cv', number = 5)
lm.cv <- train(hv271 ~ ., data=df, method="lm", trControl = cv)
print(lm.cv) # CV RMSE 21964.89 (see output below)
## Linear Regression
##
## 553 samples
## 21 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 443, 441, 442, 444, 442
## Resampling results:
##
## RMSE Rsquared MAE
## 21964.89 0.9389016 16959.22
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# NOTE(review): residuals() on a caret train object gives the final model's
# in-sample residuals, so this matches rmse.lm.basic rather than the CV RMSE
rmse.lm.cv <- sqrt(mean(residuals(lm.cv)^2))
rmse.lm.cv # 20066.05
## [1] 20066.05
# try again without variables p > 0.5
sigColumns <- c("hv206","hv207","hv208","hv209","hv210","hv212","hv221","hv227","hv243a","hv243b","hv243c","hv243d","hv243e","hv246c","hv246d","hv246e","hv246f","hv247", "hv271")
df <- df[,sigColumns]
lm.improved <- lm(hv271 ~ ., data=df)
summary(lm.improved)
##
## Call:
## lm(formula = hv271 ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72583 -13485 -108 12712 78892
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -85548.0 4887.0 -17.505 < 2e-16 ***
## hv206 40951.4 12741.0 3.214 0.001387 **
## hv207 35555.6 6808.6 5.222 2.54e-07 ***
## hv208 56674.1 17938.5 3.159 0.001671 **
## hv209 24951.5 17640.1 1.414 0.157805
## hv210 46984.3 15498.0 3.032 0.002550 **
## hv212 138868.5 24685.3 5.626 2.99e-08 ***
## hv221 79086.8 63730.7 1.241 0.215168
## hv227 -37635.1 5184.3 -7.259 1.38e-12 ***
## hv243a 99825.5 6241.3 15.994 < 2e-16 ***
## hv243b 8014.8 6173.8 1.298 0.194781
## hv243c -172266.3 103151.8 -1.670 0.095500 .
## hv243d -57425.1 16758.9 -3.427 0.000658 ***
## hv243e 17334.9 23992.6 0.723 0.470297
## hv246c 35117.6 12827.0 2.738 0.006392 **
## hv246d -11617.2 1780.8 -6.524 1.59e-10 ***
## hv246e 3677.8 2815.4 1.306 0.192019
## hv246f -2826.9 450.9 -6.270 7.46e-10 ***
## hv247 71194.5 10019.5 7.106 3.85e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20430 on 534 degrees of freedom
## Multiple R-squared: 0.949, Adjusted R-squared: 0.9472
## F-statistic: 551.5 on 18 and 534 DF, p-value: < 2.2e-16
# adjusted R-squared goes up slightly to 0.9472 (from 0.947)
# r-squared stays at 0.949
# Further reduction: keep only predictors significant at p < 0.05, then
# cross-validate the smaller model ----
# try again without variables p > 0.05
vsigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
df <- df[,vsigColumns]
lm.vimproved <- lm(hv271 ~ ., data=df)
summary(lm.vimproved)
##
## Call:
## lm(formula = hv271 ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71626 -14226 142 12286 77517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -85582.5 4860.5 -17.608 < 2e-16 ***
## hv206 38801.0 12403.5 3.128 0.001854 **
## hv207 36584.1 6540.0 5.594 3.53e-08 ***
## hv208 78537.2 15344.4 5.118 4.29e-07 ***
## hv210 50220.9 15144.3 3.316 0.000974 ***
## hv212 163418.5 20706.1 7.892 1.66e-14 ***
## hv227 -36968.6 5170.4 -7.150 2.83e-12 ***
## hv243a 101483.5 6151.2 16.498 < 2e-16 ***
## hv243d -60706.9 16682.1 -3.639 0.000300 ***
## hv246c 41122.6 12164.6 3.381 0.000776 ***
## hv246d -10197.8 1521.0 -6.704 5.10e-11 ***
## hv246f -2776.5 450.6 -6.162 1.40e-09 ***
## hv247 74367.4 9725.6 7.647 9.51e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20560 on 540 degrees of freedom
## Multiple R-squared: 0.9477, Adjusted R-squared: 0.9466
## F-statistic: 815.7 on 12 and 540 DF, p-value: < 2.2e-16
rmse.lm.vimproved <- sqrt(mean(residuals(lm.vimproved)^2))
rmse.lm.vimproved # 20313.64 (see output below)
## [1] 20313.64
# NOTE(review): per the output above the fit is marginally WORSE than the
# larger models (adjusted R-squared 0.9466 vs 0.9472, RMSE 20313 vs 20066);
# the gain is parsimony, not fit
# try cross validation again with smaller model
set.seed(44)
vsigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
# df is already this column subset, so this line is a harmless no-op
df <- df[,vsigColumns]
lm.cv.improved <- train(hv271 ~ ., data=df, method="lm", trControl = cv)
print(lm.cv.improved) # CV RMSE 21020.78 (better than the full model's 21964.89)
## Linear Regression
##
## 553 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 443, 441, 442, 444, 442
## Resampling results:
##
## RMSE Rsquared MAE
## 21020.78 0.9445597 16568.09
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# in-sample residual RMSE of the final refit model (matches lm.vimproved)
rmse.lm.cv.improved <- sqrt(mean(residuals(lm.cv.improved)^2))
rmse.lm.cv.improved # 20313.64
## [1] 20313.64
# Map the null-model and full-model residuals over the area polygons ----
# map null model
ggplot() +
geom_sf(data=areas.sf, fill="darkgreen", color="black", alpha=0.5) +
ggtitle("Null Model of Wealth Index across Sierra Leone")
# map null model residuals
# reattach the polygons so geom_sf can draw each area's residual
df$geometry <- geometry
df$residuals <- residuals(lm.mean)
ggplot(df) +
geom_sf(aes(fill=residuals, geometry=geometry), color="transparent") +
scale_fill_gradient2() +
ggtitle("Map of Null Model Residuals")
# map residuals
df$geometry <- geometry
df$residuals <- residuals(lm.basic)
ggplot(df) +
geom_sf(aes(fill=residuals, geometry=geometry), color="transparent") +
scale_fill_gradient2() +
ggtitle("Map of Linear Model Residuals")
# histogram of residuals
ggplot(df) +
geom_histogram(aes(residuals), fill="darkgreen", bins = 20)
# save down significant columns df
df$geometry <- geometry
sl19lm <- df
saveRDS(sl19lm, "./exports/sl19lm.rds")
# Hold-out evaluation: fit on 80% of areas, score on the remaining 20% ----
# read in data
sl19num <- readRDS("./exports/sl19num.rds")
df <- sl19num
# split train / test data
f = 4/5
n = nrow(df)
set.seed(44)
# sample row indices for the training set (80%, without replacement)
train <- sample(n, n*f, replace=FALSE)
traindf <- df[train,]
testdf <- df[-train,]
# prep for lm
# keep only the test-set polygons (geometry is reassigned before later use)
geometry <- testdf$geometry
traindf <- traindf %>% st_drop_geometry()
testdf <- testdf %>% st_drop_geometry()
df <- traindf
# run lm
lm.train <- lm(hv271 ~ ., data=df)
summary(lm.train)
##
## Call:
## lm(formula = hv271 ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69095 -14052 -1130 11336 76074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87027.38 5585.93 -15.580 < 2e-16 ***
## hv206 35055.57 14888.17 2.355 0.019002 *
## hv207 32463.10 7943.36 4.087 5.24e-05 ***
## hv208 73026.58 20767.20 3.516 0.000485 ***
## hv209 14419.92 20050.49 0.719 0.472430
## hv210 43473.13 17847.42 2.436 0.015273 *
## hv211 -8287.26 13130.91 -0.631 0.528302
## hv212 156336.70 27863.84 5.611 3.66e-08 ***
## hv221 92167.96 69265.86 1.331 0.184029
## hv227 -34306.70 5974.95 -5.742 1.80e-08 ***
## hv243a 100099.22 7221.33 13.862 < 2e-16 ***
## hv243b 6339.94 7235.74 0.876 0.381423
## hv243c -156781.23 116386.80 -1.347 0.178684
## hv243d -47068.43 18763.53 -2.509 0.012500 *
## hv243e -7619.45 26637.00 -0.286 0.774983
## hv246a -53.72 3272.63 -0.016 0.986910
## hv246b 418.30 1156.62 0.362 0.717789
## hv246c 107440.34 48190.51 2.229 0.026309 *
## hv246d -13013.17 2317.69 -5.615 3.58e-08 ***
## hv246e 7240.95 3488.86 2.075 0.038553 *
## hv246f -2688.34 513.13 -5.239 2.56e-07 ***
## hv247 76855.74 12023.27 6.392 4.35e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20600 on 420 degrees of freedom
## Multiple R-squared: 0.95, Adjusted R-squared: 0.9475
## F-statistic: 379.9 on 21 and 420 DF, p-value: < 2.2e-16
# out-of-sample RMSE on the held-out 20% (Metrics::rmse)
pred.lm <- predict(object = lm.train, newdata = testdf)
rmse.lm <- rmse(actual = testdf$hv271, predicted = pred.lm )
print(rmse.lm) # 24201.98 (worse than the cross-validated estimate)
## [1] 24201.98
# try a test/train split again with fewer columns
vsigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
df <- traindf[,vsigColumns]
testdf <- testdf[,vsigColumns]
lm.train.improved <- lm(hv271 ~ ., data=df)
summary(lm.train.improved) # R2 0.9531
##
## Call:
## lm(formula = hv271 ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70425 -14199 -1060 12158 75068
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87112.2 5420.4 -16.071 < 2e-16 ***
## hv206 34706.0 14339.1 2.420 0.0159 *
## hv207 31768.7 7484.6 4.245 2.69e-05 ***
## hv208 84842.3 17248.9 4.919 1.24e-06 ***
## hv210 43422.7 17219.8 2.522 0.0120 *
## hv212 167087.5 22586.1 7.398 7.35e-13 ***
## hv227 -34614.7 5798.4 -5.970 4.99e-09 ***
## hv243a 102425.8 6886.6 14.873 < 2e-16 ***
## hv243d -47348.7 18403.9 -2.573 0.0104 *
## hv246c 112947.9 47465.8 2.380 0.0178 *
## hv246d -9791.0 1798.0 -5.445 8.72e-08 ***
## hv246f -2562.7 511.4 -5.011 7.93e-07 ***
## hv247 75569.4 11505.6 6.568 1.48e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20670 on 429 degrees of freedom
## Multiple R-squared: 0.9486, Adjusted R-squared: 0.9471
## F-statistic: 659.4 on 12 and 429 DF, p-value: < 2.2e-16
pred.lm.improved <- predict(object = lm.train.improved, newdata = testdf)
rmse.lm.improved <- rmse(actual = testdf$hv271, predicted = pred.lm.improved )
print(rmse.lm.improved) # RMSE 22619 (not better)
## [1] 23592.15
# try a test/train split with few columns and cross validation
df <- traindf[,vsigColumns]
testdf <- testdf[,vsigColumns]
cv <- trainControl(method = 'cv', number = 5)
lm.cv.train <- train(hv271 ~ ., data=df, method="lm", trControl = cv)
print(lm.cv.train) # RMSE 20651, R-squared 0.9487
## Linear Regression
##
## 442 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 354, 354, 353, 354, 353
## Resampling results:
##
## RMSE Rsquared MAE
## 20867.56 0.9454514 16357.48
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Test-set performance of the CV-tuned model. caret refits the final lm on all
# of df, so its predictions match the plain 80/20 model exactly.
pred.lm.cv.train <- predict(object = lm.cv.train, newdata = testdf)
rmse.lm.cv.train <- rmse(actual = testdf$hv271, predicted = pred.lm.cv.train )
print(rmse.lm.cv.train) # RMSE 23592.15 (not better - exactly the same as 80/20)
## [1] 23592.15
# get same data as for regression models
sl19num <- readRDS("./exports/sl19num.rds")
df <- sl19num
geometry <- df$geometry # keep the sf geometry column so residuals can be mapped later
# define neighbors & weights (queen contiguity: polygons sharing any boundary
# point are neighbors; zero.policy tolerates regions with no neighbors)
queen_neighbors <- spdep::poly2nb(geometry)
queen_weights <- spdep::nb2listw(queen_neighbors, zero.policy = TRUE)
# null spatial lag model: intercept plus spatially lagged response only,
# a baseline for the covariate models below
df <- df %>% st_drop_geometry()
lm.lag.null <- lagsarlm(hv271 ~ 1, data=df, queen_weights, zero.policy = TRUE)
## Warning in lagsarlm(hv271 ~ 1, data = df, queen_weights, zero.policy = TRUE): inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## reciprocal condition number = 3.09995e-20 - using numerical Hessian.
summary(lm.lag.null)
##
## Call:lagsarlm(formula = hv271 ~ 1, data = df, listw = queen_weights,
## zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -135748.5 -30642.3 -8229.1 22728.4 181907.1
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 899.37 1805.88 0.498 0.6185
##
## Rho: 0.85025, LR test value: 623.81, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.020406
## z-value: 41.667, p-value: < 2.22e-16
## Wald statistic: 1736.1, p-value: < 2.22e-16
##
## Log likelihood: -6773.982 for lag model
## ML residual variance (sigma squared): 2082300000, (sigma: 45633)
## Number of observations: 553
## Number of parameters estimated: 3
## AIC: 13554, (AIC for lm: 14176)
# In-sample RMSE of the null lag model (matches the ML sigma reported above).
rmse.lm.lag.null <- sqrt(mean(residuals(lm.lag.null)^2))
rmse.lm.lag.null # 45632.65
## [1] 45632.65
# spatial lag model with all covariates
lm.lag <- lagsarlm(hv271 ~ ., data=df, queen_weights, zero.policy = TRUE)
## Warning in lagsarlm(hv271 ~ ., data = df, queen_weights, zero.policy = TRUE): inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## reciprocal condition number = 2.2272e-19 - using numerical Hessian.
## Warning in sqrt(diag(fdHess)[-1]): NaNs produced
summary(lm.lag)
##
## Call:lagsarlm(formula = hv271 ~ ., data = df, listw = queen_weights,
## zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61655.84 -13567.67 -390.47 11843.22 74602.26
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -79637.722 4597.905 -17.3204 < 2.2e-16
## hv206 35222.625 11267.725 3.1260 0.001772
## hv207 32023.061 6249.282 5.1243 2.987e-07
## hv208 34813.509 17214.024 2.0224 0.043136
## hv209 12328.026 14030.529 0.8787 0.379587
## hv210 39684.103 13571.732 2.9240 0.003455
## hv211 17126.059 10376.510 1.6505 0.098848
## hv212 136616.393 22624.041 6.0385 1.555e-09
## hv221 57276.780 57173.736 1.0018 0.316439
## hv227 -27130.154 4825.654 -5.6221 1.887e-08
## hv243a 84287.480 6108.230 13.7990 < 2.2e-16
## hv243b 8702.867 4638.643 1.8762 0.060632
## hv243c -159340.756 96454.466 -1.6520 0.098539
## hv243d -44894.176 15730.744 -2.8539 0.004318
## hv243e 32089.657 20878.524 1.5370 0.124301
## hv246a -259.343 NaN NaN NaN
## hv246b 11.338 NaN NaN NaN
## hv246c 29966.902 9492.951 3.1568 0.001595
## hv246d -10072.715 1587.216 -6.3462 2.208e-10
## hv246e 3620.080 2040.447 1.7742 0.076037
## hv246f -2520.584 411.986 -6.1181 9.468e-10
## hv247 78276.090 9322.626 8.3964 < 2.2e-16
##
## Rho: 0.18349, LR test value: 64.899, p-value: 7.7716e-16
## Approximate (numerical Hessian) standard error: 0.021884
## z-value: 8.3846, p-value: < 2.22e-16
## Wald statistic: 70.302, p-value: < 2.22e-16
##
## Log likelihood: -6230.676 for lag model
## ML residual variance (sigma squared): 355790000, (sigma: 18862)
## Number of observations: 553
## Number of parameters estimated: 24
## AIC: 12509, (AIC for lm: 12572)
# AIC 12509 instead of 12572 for lm, so the spatial lag helps
# rho = 0.18349, p-value << 0.05 (significant spatial autocorrelation)
# pull out significant columns from spatial model
spSigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
# same as vsigColumns from non-spatial model
# rerun model with fewer columns
df <- df[,spSigColumns]
lm.lag.improved <- lagsarlm(hv271 ~ ., data=df, queen_weights, zero.policy = TRUE)
## Warning in lagsarlm(hv271 ~ ., data = df, queen_weights, zero.policy = TRUE): inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## reciprocal condition number = 2.16936e-19 - using numerical Hessian.
summary(lm.lag.improved)
##
## Call:lagsarlm(formula = hv271 ~ ., data = df, listw = queen_weights,
## zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60135.0 -14010.9 -1344.4 11931.0 73014.8
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -81085.38 4559.35 -17.7844 < 2.2e-16
## hv206 29426.44 11643.84 2.5272 0.0114972
## hv207 34465.04 6100.06 5.6499 1.605e-08
## hv208 55376.63 14600.72 3.7927 0.0001490
## hv210 49730.85 14125.72 3.5206 0.0004306
## hv212 159744.17 19277.61 8.2865 2.220e-16
## hv227 -25589.95 5012.70 -5.1050 3.308e-07
## hv243a 88543.80 5936.76 14.9145 < 2.2e-16
## hv243d -52392.18 15545.11 -3.3703 0.0007508
## hv246c 37277.61 11338.89 3.2876 0.0010105
## hv246d -8424.34 1432.49 -5.8809 4.080e-09
## hv246f -2507.38 420.91 -5.9570 2.569e-09
## hv247 82821.39 9112.14 9.0891 < 2.2e-16
##
## Rho: 0.17731, LR test value: 62.742, p-value: 2.3315e-15
## Approximate (numerical Hessian) standard error: 0.021657
## z-value: 8.1871, p-value: 2.2204e-16
## Wald statistic: 67.028, p-value: 2.2204e-16
##
## Log likelihood: -6238.535 for lag model
## ML residual variance (sigma squared): 366200000, (sigma: 19136)
## Number of observations: 553
## Number of parameters estimated: 15
## AIC: 12507, (AIC for lm: 12568)
# In-sample RMSE of the reduced spatial lag model (matches the ML sigma above).
rmse.lm.lag.improved <- sqrt(mean(residuals(lm.lag.improved)^2))
rmse.lm.lag.improved # 19136.45
## [1] 19136.45
# AIC 12507 instead of 12568 for lm
# rho = 0.17731, p-value << 0.05
# Map of model residuals: re-attach the saved geometry, then shade each region
# by its spatial-lag-model residual (diverging palette centred on zero).
df$geometry <- geometry
df$residuals <- residuals(lm.lag.improved)
ggplot(data = df) +
  geom_sf(mapping = aes(geometry = geometry, fill = residuals), color = "transparent") +
  scale_fill_gradient2() +
  ggtitle("Map of Spatial Lag Model Residuals")
# Distribution of the same residuals.
ggplot(data = df) +
  geom_histogram(mapping = aes(x = residuals), bins = 15, fill = "purple")
# get same data as for regression models
sl19num <- readRDS("./exports/sl19num.rds")
df <- sl19num
geometry <- df$geometry # saved separately so residuals can be mapped later
df <- st_drop_geometry(df)
# define neighbors & weights (queen contiguity, as for the lag model)
queen_neighbors <- spdep::poly2nb(geometry)
queen_weights <- spdep::nb2listw(queen_neighbors, zero.policy = TRUE)
# try spatial error model: spatial autocorrelation modelled in the error term
# rather than in the response
lm.err <- errorsarlm(hv271 ~ ., data=df, queen_weights, zero.policy = TRUE)
## Warning in errorsarlm(hv271 ~ ., data = df, queen_weights, zero.policy = TRUE): inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## reciprocal condition number = 5.38187e-18 - using numerical Hessian.
summary(lm.err)
##
## Call:errorsarlm(formula = hv271 ~ ., data = df, listw = queen_weights,
## zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60639.7 -12600.8 -1148.3 11869.7 64050.2
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -84370.11 5078.74 -16.6124 < 2.2e-16
## hv206 49716.46 11724.72 4.2403 2.232e-05
## hv207 28237.25 6471.72 4.3632 1.282e-05
## hv208 58050.38 16494.38 3.5194 0.0004325
## hv209 20672.88 15662.18 1.3199 0.1868606
## hv210 35733.73 14387.61 2.4836 0.0130045
## hv211 22711.01 10341.79 2.1960 0.0280888
## hv212 110421.83 22522.82 4.9027 9.454e-07
## hv221 36088.67 56531.09 0.6384 0.5232223
## hv227 -25031.13 5385.39 -4.6480 3.352e-06
## hv243a 83401.67 6222.69 13.4028 < 2.2e-16
## hv243b 11192.76 6390.94 1.7513 0.0798861
## hv243c -229653.33 91738.11 -2.5034 0.0123021
## hv243d -66884.89 18074.79 -3.7005 0.0002152
## hv243e 25546.14 20979.65 1.2177 0.2233522
## hv246a 1059.11 2779.37 0.3811 0.7031569
## hv246b 1079.65 1045.09 1.0331 0.3015738
## hv246c 39743.94 11213.64 3.5442 0.0003937
## hv246d -10596.08 1712.59 -6.1872 6.126e-10
## hv246e 254.10 2752.53 0.0923 0.9264469
## hv246f -2332.01 423.46 -5.5070 3.650e-08
## hv247 71971.69 9398.45 7.6578 1.887e-14
##
## Lambda: 0.52797, LR test value: 70.433, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.052836
## z-value: 9.9927, p-value: < 2.22e-16
## Wald statistic: 99.853, p-value: < 2.22e-16
##
## Log likelihood: -6227.908 for error model
## ML residual variance (sigma squared): 333640000, (sigma: 18266)
## Number of observations: 553
## Number of parameters estimated: 24
## AIC: 12504, (AIC for lm: 12572)
# AIC 12504 instead of 12572 for lm, so the spatial error term helps
# lambda = 0.52797, p-value << 0.05 (significant error autocorrelation)
# pull out significant columns from spatial model
spSigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
# same as vsigColumns from non-spatial model
# rerun model with fewer columns
df <- df[,spSigColumns]
lm.err.improved <- errorsarlm(hv271 ~ ., data=df, queen_weights, zero.policy = TRUE)
## Warning in errorsarlm(hv271 ~ ., data = df, queen_weights, zero.policy = TRUE): inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## reciprocal condition number = 5.48439e-18 - using numerical Hessian.
summary(lm.err.improved)
##
## Call:errorsarlm(formula = hv271 ~ ., data = df, listw = queen_weights,
## zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60613.58 -13156.52 -413.76 11717.17 65445.18
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -85801.72 5039.91 -17.0244 < 2.2e-16
## hv206 45994.37 11726.71 3.9222 8.775e-05
## hv207 31630.05 6220.46 5.0848 3.679e-07
## hv208 77151.88 14939.90 5.1641 2.415e-07
## hv210 43544.05 14207.27 3.0649 0.002177
## hv212 133253.86 18932.13 7.0385 1.943e-12
## hv227 -24403.04 5394.83 -4.5234 6.085e-06
## hv243a 89899.05 6020.66 14.9318 < 2.2e-16
## hv243d -76285.03 17926.24 -4.2555 2.086e-05
## hv246c 42115.96 10629.97 3.9620 7.432e-05
## hv246d -9963.92 1540.35 -6.4686 9.890e-11
## hv246f -2373.07 429.99 -5.5189 3.412e-08
## hv247 79620.56 9113.82 8.7362 < 2.2e-16
##
## Lambda: 0.48308, LR test value: 63.831, p-value: 1.3323e-15
## Approximate (numerical Hessian) standard error: 0.053024
## z-value: 9.1105, p-value: < 2.22e-16
## Wald statistic: 83.002, p-value: < 2.22e-16
##
## Log likelihood: -6237.991 for error model
## ML residual variance (sigma squared): 349880000, (sigma: 18705)
## Number of observations: 553
## Number of parameters estimated: 15
## AIC: 12506, (AIC for lm: 12568)
# In-sample RMSE of the reduced spatial error model (matches the ML sigma above).
rmse.lm.err.improved <- sqrt(mean(residuals(lm.err.improved)^2))
rmse.lm.err.improved # 18704.97
## [1] 18704.97
# AIC 12506 instead of 12568 for lm
# lambda = 0.48308, p-value << 0.05
# Map of model residuals: restore the geometry column and shade each region by
# its spatial-error-model residual (diverging palette centred on zero).
df$geometry <- geometry
df$residuals <- residuals(lm.err.improved)
ggplot(data = df) +
  geom_sf(mapping = aes(geometry = geometry, fill = residuals), color = "transparent") +
  scale_fill_gradient2() +
  ggtitle("Map of Spatial Error Model Residuals")
# Distribution of the same residuals.
ggplot(data = df) +
  geom_histogram(mapping = aes(x = residuals), bins = 15, fill = "blue")
### Run Significance Tests
# Lagrange multiplier diagnostics: fit a plain (non-spatial) lm, then test
# whether a spatial lag or a spatial error specification is warranted.
# read in lm dataframe
sl19lm <- readRDS("./exports/sl19lm.rds")
df <- sl19lm
df <- df[-c(47, 68), ] # try removing regions with no neighbors
geometry <- df$geometry
df <- select(df, -geometry) # use <- for assignment (was `=`)
# define neighbors & weights
queen_neighbors <- spdep::poly2nb(geometry)
queen_weights <- spdep::nb2listw(queen_neighbors, zero.policy = TRUE)
# basic linear model
lm.basic <- lm(hv271 ~ ., data=df)
# run lm tests
# NOTE(review): lm.LMtests() is deprecated in recent spdep releases in favour
# of lm.RStests(); keep an eye on this when upgrading spdep.
lmtests <- lm.LMtests(lm.basic, listw=queen_weights, test="all")
summary(lmtests)
## Lagrange multiplier diagnostics for spatial dependence
## data:
## model: lm(formula = hv271 ~ ., data = df)
## weights: queen_weights
##
## statistic parameter p.value
## LMerr 12.5857 1 0.0003887 ***
## LMlag 2.1162 1 0.1457512
## RLMerr 12.0503 1 0.0005178 ***
## RLMlag 1.5808 1 0.2086464
## SARMA 14.1665 2 0.0008391 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In this run only the error-model diagnostics are significant: LMerr and
# RLMerr have p < 0.001 while LMlag and RLMlag do not reach significance,
# so we'll go with the spatial error model.
# I guess this tells us that there's systematic bias in the errors that we're
# not accounting for. Likely from omitted variables... probably the categorical
# variables that I wasn't able to include.
# import sf dataframe
sl19num <- readRDS("./exports/sl19num.rds")
df <- sl19num
# cut down to significant columns
spSigColumns <- c("hv206","hv207","hv208","hv210","hv212","hv227","hv243a","hv243d","hv246c","hv246d","hv246f","hv247", "hv271")
df <- df[,spSigColumns]
# convert to a spatial polygon df (sp class, as required by GWmodel)
df.sp <- as(df, "Spatial")
# find the right bandwidth for gwr: adaptive = bandwidth expressed as a count
# of nearest neighbours, chosen by cross-validation
# NOTE(review): prefer adaptive=TRUE over the reassignable shorthand T
bw <- bw.gwr(hv271 ~ ., data = df.sp, adaptive=T)
## Adaptive bandwidth: 349 CV score: 2.19461e+11
## Adaptive bandwidth: 224 CV score: 228092871703
## Adaptive bandwidth: 428 CV score: 217558175915
## Adaptive bandwidth: 475 CV score: 218963482933
## Adaptive bandwidth: 397 CV score: 216315109381
## Adaptive bandwidth: 379 CV score: 217028630754
## Adaptive bandwidth: 409 CV score: 216915508517
## Adaptive bandwidth: 390 CV score: 216187113706
## Adaptive bandwidth: 385 CV score: 216240570769
## Adaptive bandwidth: 392 CV score: 216414476082
## Adaptive bandwidth: 387 CV score: 2.16179e+11
## Adaptive bandwidth: 387 CV score: 2.16179e+11
bw # 387 nearest neighbours selected by CV
## [1] 387
# run gwr with the CV-selected adaptive bandwidth and a gaussian kernel
lm.gwr <- gwr.basic(hv271 ~ ., data=df.sp, bw=bw, kernel="gaussian", adaptive=TRUE)
lm.gwr
## ***********************************************************************
## * Package GWmodel *
## ***********************************************************************
## Program starts at: 2024-03-03 20:42:10
## Call:
## gwr.basic(formula = hv271 ~ ., data = df.sp, bw = bw, kernel = "gaussian",
## adaptive = TRUE)
##
## Dependent (y) variable: hv271
## Independent variables: .
## Number of data points: 553
## ***********************************************************************
## * Results of Global Regression *
## ***********************************************************************
##
## Call:
## lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71626 -14226 142 12286 77517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -85582.5 4860.5 -17.608 < 2e-16 ***
## hv206 38801.0 12403.5 3.128 0.001854 **
## hv207 36584.1 6540.0 5.594 3.53e-08 ***
## hv208 78537.2 15344.4 5.118 4.29e-07 ***
## hv210 50220.9 15144.3 3.316 0.000974 ***
## hv212 163418.5 20706.1 7.892 1.66e-14 ***
## hv227 -36968.6 5170.4 -7.150 2.83e-12 ***
## hv243a 101483.5 6151.2 16.498 < 2e-16 ***
## hv243d -60706.9 16682.1 -3.639 0.000300 ***
## hv246c 41122.6 12164.6 3.381 0.000776 ***
## hv246d -10197.8 1521.0 -6.704 5.10e-11 ***
## hv246f -2776.5 450.6 -6.162 1.40e-09 ***
## hv247 74367.4 9725.6 7.647 9.51e-14 ***
##
## ---Significance stars
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Residual standard error: 20560 on 540 degrees of freedom
## Multiple R-squared: 0.9477
## Adjusted R-squared: 0.9466
## F-statistic: 815.7 on 12 and 540 DF, p-value: < 2.2e-16
## ***Extra Diagnostic information
## Residual sum of squares: 228192061099
## Sigma(hat): 20350.47
## AIC: 12567.81
## AICc: 12568.59
## BIC: 12163.64
## ***********************************************************************
## * Results of Geographically Weighted Regression *
## ***********************************************************************
##
## *********************Model calibration information*********************
## Kernel function: gaussian
## Adaptive bandwidth: 387 (number of nearest neighbours)
## Regression points: the same locations as observations are used.
## Distance metric: Euclidean distance metric is used.
##
## ****************Summary of GWR coefficient estimates:******************
## Min. 1st Qu. Median 3rd Qu. Max.
## Intercept -90492.7 -88417.6 -85855.7 -82925.8 -81703.5
## hv206 33103.6 34079.4 38367.3 42248.7 45783.0
## hv207 34832.4 35288.4 35513.6 36139.2 36830.4
## hv208 75002.5 77148.8 79359.3 79868.9 80604.3
## hv210 45836.3 46893.1 49596.8 53661.8 54412.5
## hv212 154268.1 158197.6 165619.3 174422.8 181273.6
## hv227 -42691.9 -41208.9 -36599.9 -32925.4 -29711.8
## hv243a 95410.0 97891.9 100645.9 105908.9 107147.8
## hv243d -62528.1 -61565.8 -61222.3 -60670.4 -58660.5
## hv246c 37837.3 40219.5 43696.4 46476.7 50121.3
## hv246d -12768.4 -12186.7 -10699.3 -9601.4 -8840.6
## hv246f -2841.2 -2780.1 -2668.3 -2568.6 -2469.7
## hv247 72763.5 73248.8 75549.1 77002.0 77868.4
## ************************Diagnostic information*************************
## Number of data points: 553
## Effective number of parameters (2trace(S) - trace(S'S)): 19.60113
## Effective degrees of freedom (n-2trace(S) + trace(S'S)): 533.3989
## AICc (GWR book, Fotheringham, et al. 2002, p. 61, eq 2.33): 12538.8
## AIC (GWR book, Fotheringham, et al. 2002,GWR p. 96, eq. 4.22): 12518.97
## BIC (GWR book, Fotheringham, et al. 2002,GWR p. 61, eq. 2.34): 12054.22
## Residual sum of squares: 213252104866
## R-square value: 0.9511415
## Adjusted R-square value: 0.9493427
##
## ***********************************************************************
## Program stops at: 2024-03-03 20:42:10
# Map GWR Coefficients Across the Country ----
# The same copy-column-then-plot pattern was repeated eight times; factor the
# plotting into a small helper. Each coefficient surface is still copied into
# df (as gwr_<var>) before plotting, exactly as before, so the columns remain
# available on df afterwards.
#
# map_gwr_coef(): choropleth of one GWR coefficient column.
#   data   - sf data frame holding a gwr_* coefficient column
#   column - name of that column, as a string
#   title  - plot title
map_gwr_coef <- function(data, column, title) {
  ggplot(data = data) +
    geom_sf(aes(fill = .data[[column]])) +
    scale_fill_gradient2() +
    labs(fill = column) + # keep the legend titled with the column name
    ggtitle(title)
}
# electricity
df$gwr_hv206 <- lm.gwr$SDF$hv206
map_gwr_coef(df, "gwr_hv206", "GWR Electricity Coefficients")
# this means that electricity is a better predictor of wealth the farther we get from Freetown
# radio
df$gwr_hv207 <- lm.gwr$SDF$hv207
map_gwr_coef(df, "gwr_hv207", "GWR Radio Coefficients")
# tv
df$gwr_hv208 <- lm.gwr$SDF$hv208
map_gwr_coef(df, "gwr_hv208", "GWR TV Coefficients")
# mobile phone
df$gwr_hv243a <- lm.gwr$SDF$hv243a
map_gwr_coef(df, "gwr_hv243a", "GWR Mobile Phone Coefficients")
# horses / donkeys / mules
df$gwr_hv246c <- lm.gwr$SDF$hv246c
map_gwr_coef(df, "gwr_hv246c", "GWR Horse/Donkey/Mule Coefficients")
# goats
df$gwr_hv246d <- lm.gwr$SDF$hv246d
map_gwr_coef(df, "gwr_hv246d", "GWR Goats Coefficients")
# woah, owning goats means you're poor! But more so in the more urban, western region
# chickens
df$gwr_hv246f <- lm.gwr$SDF$hv246f
map_gwr_coef(df, "gwr_hv246f", "GWR Chicken Coefficients")
# woah, very negatively correlated with wealth! everywhere.
# bank account
df$gwr_hv247 <- lm.gwr$SDF$hv247
map_gwr_coef(df, "gwr_hv247", "GWR Bank Account Coefficients")
# read in data
sl19clean <- readRDS("./exports/sl19clean.rds")
df <- sl19clean
# filter down to columns of interest (household assets/characteristics plus
# the response hv271 = wealth index)
dTreeCols <- c("hv010","hv011","hv012","hv014","hv025","hv040","hv045c","hv201","hv204","hv205","hv206","hv207","hv208","hv209","hv210","hv211","hv212","hv213","hv214","hv215","hv216","hv220","hv221","hv226","hv227","hv230a","hv237","hv241","hv243a","hv243b","hv243c","hv243d","hv243e","hv244","hv246","hv246a","hv246b","hv246c","hv246d","hv246e","hv246f","hv247","hv271")
df <- sl19clean[,dTreeCols]
# clean data: drop rows with any missing value
df <- na.omit(df)
# categorical DHS codes: convert to factors so the tree treats them as nominal
dTreeFactors <- c("hv025","hv045c","hv201","hv205","hv206","hv207","hv208","hv209","hv210","hv211","hv212","hv213","hv214","hv215","hv221","hv226","hv227","hv230a","hv237","hv241","hv243a","hv243b","hv243c","hv243d","hv243e","hv244","hv246","hv247")
# mutate_at() is superseded; use across() (dplyr >= 1.0, loaded per the header)
df <- df %>% mutate(across(all_of(dTreeFactors), as.factor))
str(df)
## 'data.frame': 12768 obs. of 43 variables:
## $ hv010 : num 0 0 1 1 1 2 0 0 1 1 ...
## $ hv011 : num 1 0 0 0 0 0 1 0 0 0 ...
## $ hv012 : num 1 5 1 5 22 8 5 2 8 10 ...
## $ hv014 : num 0 0 0 0 1 1 2 1 2 1 ...
## $ hv025 : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hv040 : num 46 46 46 46 46 46 46 46 46 46 ...
## $ hv045c: Factor w/ 6 levels "1","2","3","4",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ hv201 : Factor w/ 15 levels "11","12","13",..: 4 5 10 4 10 10 10 10 5 5 ...
## $ hv204 : dbl+lbl [1:12768] 20, 25, 60, 20, 30, 40, 40, 20, 30, 35, 2...
## ..@ label : chr "time to get to water source (minutes)"
## ..@ format.stata: chr "%8.0g"
## ..@ labels : Named num 996 998
## .. ..- attr(*, "names")= chr [1:2] "on premises" "don't know"
## $ hv205 : Factor w/ 12 levels "11","12","13",..: 9 8 8 7 9 9 7 8 8 8 ...
## $ hv206 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hv207 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
## $ hv208 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hv209 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hv210 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hv211 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hv212 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hv213 : Factor w/ 10 levels "11","12","21",..: 1 1 8 1 1 8 1 1 1 8 ...
## $ hv214 : Factor w/ 16 levels "11","12","13",..: 4 4 4 3 4 10 4 4 4 10 ...
## $ hv215 : Factor w/ 14 levels "11","12","13",..: 5 8 2 5 8 8 5 2 8 8 ...
## $ hv216 : num 3 2 1 4 4 4 2 1 3 4 ...
## $ hv220 : dbl+lbl [1:12768] 48, 76, 35, 73, 58, 60, 60, 51, 41, 31, 50, 32, 51, ...
## ..@ label : chr "age of head of household"
## ..@ format.stata: chr "%8.0g"
## ..@ labels : Named num 95 98
## .. ..- attr(*, "names")= chr [1:2] "95+" "don't know"
## $ hv221 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hv226 : Factor w/ 10 levels "1","2","3","4",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ hv227 : Factor w/ 2 levels "0","1": 2 2 1 2 2 2 2 2 2 1 ...
## $ hv230a: Factor w/ 5 levels "1","2","3","4",..: 3 2 2 3 3 3 3 2 2 2 ...
## $ hv237 : Factor w/ 3 levels "0","1","8": 2 2 1 2 1 1 1 1 2 2 ...
## $ hv241 : Factor w/ 4 levels "1","2","3","6": 2 2 2 2 2 2 2 2 2 2 ...
## $ hv243a: Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 2 1 ...
## $ hv243b: Factor w/ 2 levels "0","1": 2 1 1 1 2 1 1 1 1 1 ...
## $ hv243c: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hv243d: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hv243e: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hv244 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 1 ...
## $ hv246 : Factor w/ 2 levels "0","1": 2 2 1 2 2 1 1 2 2 2 ...
## $ hv246a: dbl+lbl [1:12768] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "owns cattle"
## ..@ format.stata: chr "%8.0g"
## ..@ labels : Named num 0 95 98
## .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
## $ hv246b: dbl+lbl [1:12768] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "owns cows/ bulls"
## ..@ format.stata: chr "%8.0g"
## ..@ labels : Named num 0 95 98
## .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
## $ hv246c: dbl+lbl [1:12768] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "owns horses/ donkeys/ mules"
## ..@ format.stata: chr "%8.0g"
## ..@ labels : Named num 0 95 98
## .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
## $ hv246d: dbl+lbl [1:12768] 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "owns goats"
## ..@ format.stata: chr "%8.0g"
## ..@ labels : Named num 0 95 98
## .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
## $ hv246e: dbl+lbl [1:12768] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "owns sheep"
## ..@ format.stata: chr "%8.0g"
## ..@ labels : Named num 0 95 98
## .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
## $ hv246f: dbl+lbl [1:12768] 6, 1, 0, 20, 20, 0, 0, 10, 20, 5, 6, 0, 0, ...
## ..@ label : chr "owns chickens/poultry"
## ..@ format.stata: chr "%8.0g"
## ..@ labels : Named num 0 95 98
## .. ..- attr(*, "names")= chr [1:3] "none" "95 or more" "unknown"
## $ hv247 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hv271 : num -102100 -89700 -88194 -94742 -88105 ...
## - attr(*, "na.action")= 'omit' Named int [1:195] 99 102 107 489 560 568 620 645 815 1128 ...
## ..- attr(*, "names")= chr [1:195] "99" "102" "107" "489" ...
# split into test and train data (80/20)
f <- 4/5 # training fraction (use <- for assignment; was `=`)
n <- nrow(df)
set.seed(44) # fixed seed so the split is reproducible
# random sample without replacement of row indices for the training set
train <- sample(n, n*f, replace=FALSE)
traindf <- df[train,]
testdf <- df[-train,]
# Decision Tree with 5-Fold Cross Validation
cv <- trainControl(method = 'cv', number = 5)
# FIX: train on traindf, not df. The original call fit on the full data set,
# so the rows in testdf were also in the training data and the test RMSE below
# was optimistically biased (data leakage).
# NOTE(review): the captured output below still reflects the original
# full-data (12768-sample) run; re-knit to refresh it.
dt1 <- train(hv271 ~ ., data=traindf, method="rpart", trControl = cv)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
print(dt1)
## CART
##
## 12768 samples
## 42 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 10215, 10214, 10215, 10215, 10213
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.07562718 48749.35 0.7549329 35522.23
## 0.09826903 55382.79 0.6859931 42028.31
## 0.63002492 75844.64 0.6247707 60019.59
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.07562718.
# NOTE(review): rpart.plot() requires the rpart.plot package, which is not
# among the library() calls visible at the top of the file; confirm it is
# loaded earlier.
rpart.plot(dt1$finalModel, main="Decision Tree with 5-Fold Cross Validation")
pred1 <- predict(object = dt1, newdata = testdf)
rmse1 <- rmse(actual = testdf$hv271, predicted = pred1 )
rmse1
## [1] 53566.91
## DTree without Cross Validation (manually set cp=0.005)
# A smaller complexity parameter than caret's 0.0756 optimum, allowing a
# deeper tree. method="anova" fits a regression tree on the training data.
# NOTE(review): rpart() requires the rpart package to be attached — not among
# the library() calls visible at the top of the file; confirm it is loaded.
cp2 <- 0.005
dt2 <- rpart(hv271 ~ ., data = traindf, cp=cp2, method="anova")
summary(dt2)
## Call:
## rpart(formula = hv271 ~ ., data = traindf, method = "anova",
## cp = cp2)
## n= 10214
##
## CP nsplit rel error xerror xstd
## 1 0.630948665 0 1.00000000 1.00033580 0.015470975
## 2 0.101162893 1 0.36905134 0.37026394 0.007821592
## 3 0.098306311 2 0.26788844 0.28018053 0.006777022
## 4 0.028664760 3 0.16958213 0.17067682 0.005052434
## 5 0.018954143 4 0.14091737 0.14136599 0.004017624
## 6 0.006914147 5 0.12196323 0.12296853 0.003589458
## 7 0.006705071 6 0.11504908 0.11708152 0.003517400
## 8 0.006671769 7 0.10834401 0.11346611 0.003507403
## 9 0.006143629 8 0.10167224 0.10535782 0.003433444
## 10 0.005000000 9 0.09552861 0.09785657 0.003361256
##
## Variable importance
## hv226 hv025 hv206 hv208 hv209 hv244 hv213 hv214 hv247 hv243b hv205
## 26 15 15 15 9 6 6 3 2 1 1
##
## Node number 1: 10214 observations, complexity param=0.6309487
## mean=-1603.993, MSE=9.621312e+09
## left son=2 (7264 obs) right son=3 (2950 obs)
## Primary splits:
## hv226 splits as RRRRRRRLRR, improve=0.6309487, (0 missing)
## hv206 splits as LR, improve=0.5905257, (0 missing)
## hv208 splits as LR, improve=0.5769706, (0 missing)
## hv025 splits as RL, improve=0.5629566, (0 missing)
## hv213 splits as LLRLRRRRRR, improve=0.5082371, (0 missing)
## Surrogate splits:
## hv025 splits as RL, agree=0.873, adj=0.561, (0 split)
## hv206 splits as LR, agree=0.843, adj=0.457, (0 split)
## hv208 splits as LR, agree=0.824, adj=0.392, (0 split)
## hv209 splits as LR, agree=0.786, adj=0.258, (0 split)
## hv244 splits as RL, agree=0.778, adj=0.231, (0 split)
##
## Node number 2: 7264 observations, complexity param=0.1011629
## mean=-51256.05, MSE=2.61084e+09
## left son=4 (4754 obs) right son=5 (2510 obs)
## Primary splits:
## hv213 splits as LLRLRRRRRL, improve=0.5241979, (0 missing)
## hv214 splits as LLLLLLRRLRLLRL-R, improve=0.4770795, (0 missing)
## hv208 splits as LR, improve=0.2952548, (0 missing)
## hv025 splits as RL, improve=0.2831897, (0 missing)
## hv206 splits as LR, improve=0.2708360, (0 missing)
## Surrogate splits:
## hv214 splits as LLLLLLRRLRRLRL-R, agree=0.870, adj=0.624, (0 split)
## hv025 splits as RL, agree=0.718, adj=0.185, (0 split)
## hv205 splits as RRRR-RRLLLLL, agree=0.692, adj=0.109, (0 split)
## hv247 splits as LR, agree=0.690, adj=0.102, (0 split)
## hv206 splits as LR, agree=0.680, adj=0.073, (0 split)
##
## Node number 3: 2950 observations, complexity param=0.09830631
## mean=120657.9, MSE=5.865187e+09
## left son=6 (1557 obs) right son=7 (1393 obs)
## Primary splits:
## hv208 splits as LR, improve=0.5583515, (0 missing)
## hv209 splits as LR, improve=0.5171638, (0 missing)
## hv206 splits as LR, improve=0.4869690, (0 missing)
## hv213 splits as LRRLRRRLRL, improve=0.3516020, (0 missing)
## hv247 splits as LR, improve=0.3473129, (0 missing)
## Surrogate splits:
## hv206 splits as LR, agree=0.865, adj=0.714, (0 split)
## hv209 splits as LR, agree=0.782, adj=0.538, (0 split)
## hv247 splits as LR, agree=0.701, adj=0.367, (0 split)
## hv243b splits as LR, agree=0.679, adj=0.321, (0 split)
## hv213 splits as LRRLRRRLRR, agree=0.667, adj=0.294, (0 split)
##
## Node number 4: 4754 observations, complexity param=0.006705071
## mean=-78137.03, MSE=6.42506e+08
## left son=8 (1041 obs) right son=9 (3713 obs)
## Primary splits:
## hv215 splits as LLRRLRRRR-RR--, improve=0.2157234, (0 missing)
## hv214 splits as LLLLLL--LRLLRL-R, improve=0.1997467, (0 missing)
## hv243a splits as LR, improve=0.1941441, (0 missing)
## hv025 splits as RL, improve=0.1755999, (0 missing)
## hv205 splits as --R--RRLLLRL, improve=0.1482406, (0 missing)
## Surrogate splits:
## hv214 splits as LLRRRL--RRRRRR-R, agree=0.786, adj=0.023, (0 split)
## hv241 splits as LRRR, agree=0.784, adj=0.014, (0 split)
## hv246b < 3.5 to the right, agree=0.784, adj=0.014, (0 split)
## hv243d splits as RL, agree=0.783, adj=0.008, (0 split)
## hv246d < 18 to the right, agree=0.782, adj=0.007, (0 split)
##
## Node number 5: 2510 observations, complexity param=0.02866476
## mean=-342.8375, MSE=2.37816e+09
## left son=10 (2303 obs) right son=11 (207 obs)
## Primary splits:
## hv208 splits as LR, improve=0.4719149, (0 missing)
## hv206 splits as LR, improve=0.4138054, (0 missing)
## hv209 splits as LR, improve=0.3189899, (0 missing)
## hv247 splits as LR, improve=0.3165087, (0 missing)
## hv025 splits as RL, improve=0.2241800, (0 missing)
## Surrogate splits:
## hv206 splits as LR, agree=0.948, adj=0.367, (0 split)
## hv209 splits as LR, agree=0.940, adj=0.271, (0 split)
## hv243e splits as LR, agree=0.924, adj=0.072, (0 split)
## hv221 splits as LR, agree=0.918, adj=0.010, (0 split)
##
## Node number 6: 1557 observations, complexity param=0.006914147
## mean=66529.39, MSE=1.897943e+09
## left son=12 (220 obs) right son=13 (1337 obs)
## Primary splits:
## hv213 splits as L--L-RRRRR, improve=0.2299307, (0 missing)
## hv214 splits as LLLLLLR-RRRLRRRR, improve=0.2260315, (0 missing)
## hv206 splits as LR, improve=0.1899410, (0 missing)
## hv205 splits as LRRRLRLLLLLL, improve=0.1631874, (0 missing)
## hv243b splits as LR, improve=0.1601806, (0 missing)
## Surrogate splits:
## hv214 splits as LRLLLRR-LRRLRRRR, agree=0.885, adj=0.186, (0 split)
## hv215 splits as RL-LLRRRR-RR--, agree=0.863, adj=0.027, (0 split)
## hv226 splits as LR-RRRR-LR, agree=0.861, adj=0.014, (0 split)
## hv201 splits as RRRRRRRRRLRRRRR, agree=0.859, adj=0.005, (0 split)
## hv205 splits as RRRRRRRRRLRR, agree=0.859, adj=0.005, (0 split)
##
## Node number 7: 1393 observations, complexity param=0.01895414
## mean=181159, MSE=3.364277e+09
## left son=14 (1035 obs) right son=15 (358 obs)
## Primary splits:
## hv205 splits as LRLL-LLLL-LL, improve=0.3974583, (0 missing)
## hv213 splits as LLL-RLRLRL, improve=0.3700823, (0 missing)
## hv209 splits as LR, improve=0.3503209, (0 missing)
## hv243e splits as LR, improve=0.3433737, (0 missing)
## hv212 splits as LR, improve=0.2853069, (0 missing)
## Surrogate splits:
## hv241 splits as RLLL, agree=0.792, adj=0.190, (0 split)
## hv212 splits as LR, agree=0.781, adj=0.148, (0 split)
## hv213 splits as LLL-RLRLLL, agree=0.757, adj=0.053, (0 split)
## hv201 splits as RLLLLLLLRLLLLRL, agree=0.753, adj=0.039, (0 split)
## hv221 splits as LR, agree=0.749, adj=0.022, (0 split)
##
## Node number 8: 1041 observations
## mean=-100371.4, MSE=2.632583e+08
##
## Node number 9: 3713 observations
## mean=-71903.27, MSE=5.713709e+08
##
## Node number 10: 2303 observations, complexity param=0.006671769
## mean=-10386.46, MSE=1.10947e+09
## left son=20 (1692 obs) right son=21 (611 obs)
## Primary splits:
## hv025 splits as RL, improve=0.2566029, (0 missing)
## hv247 splits as LR, improve=0.2290053, (0 missing)
## hv205 splits as LRRR-RRLLLRL, improve=0.2029599, (0 missing)
## hv243a splits as LR, improve=0.1788949, (0 missing)
## hv207 splits as LR, improve=0.1434461, (0 missing)
## Surrogate splits:
## hv201 splits as LRRLLLLLLLL---L, agree=0.744, adj=0.036, (0 split)
## hv237 splits as LLR, agree=0.743, adj=0.033, (0 split)
## hv206 splits as LR, agree=0.739, adj=0.018, (0 split)
## hv214 splits as -LLLL-RL-LRLR--R, agree=0.738, adj=0.013, (0 split)
## hv040 < 1.5 to the right, agree=0.737, adj=0.008, (0 split)
##
## Node number 11: 207 observations
## mean=111398.6, MSE=2.884666e+09
##
## Node number 12: 220 observations
## mean=15030.9, MSE=1.294514e+09
##
## Node number 13: 1337 observations
## mean=75003.33, MSE=1.489033e+09
##
## Node number 14: 1035 observations, complexity param=0.006143629
## mean=159652.9, MSE=1.759999e+09
## left son=28 (528 obs) right son=29 (507 obs)
## Primary splits:
## hv209 splits as LR, improve=0.3314381, (0 missing)
## hv243e splits as LR, improve=0.2542904, (0 missing)
## hv247 splits as LR, improve=0.2466412, (0 missing)
## hv213 splits as LLL--LRLRL, improve=0.2332752, (0 missing)
## hv201 splits as RLLLLLLLLLLRLRR, improve=0.1663839, (0 missing)
## Surrogate splits:
## hv247 splits as LR, agree=0.611, adj=0.205, (0 split)
## hv216 < 1.5 to the left, agree=0.603, adj=0.189, (0 split)
## hv213 splits as LRL--LRLRL, agree=0.598, adj=0.179, (0 split)
## hv220 < 33.5 to the left, agree=0.587, adj=0.158, (0 split)
## hv243e splits as LR, agree=0.583, adj=0.148, (0 split)
##
## Node number 15: 358 observations
## mean=243334.7, MSE=2.799372e+09
##
## Node number 20: 1692 observations
## mean=-20525.79, MSE=7.654139e+08
##
## Node number 21: 611 observations
## mean=17691.67, MSE=9.891666e+08
##
## Node number 28: 528 observations
## mean=135985.8, MSE=9.651528e+08
##
## Node number 29: 507 observations
## mean=184300.2, MSE=1.396944e+09
## Visualise the cp = 0.005 tree, score the hold-out set, and report RMSE
rpart.plot(dt2, extra = 1, main = "Decision Tree with cp=0.005")
pred2 <- predict(dt2, newdata = testdf)
rmse2 <- rmse(actual = testdf$hv271, predicted = pred2)
rmse2
## [1] 32329.84
## Predictors ranked by their total contribution to split improvement
dt2$variable.importance
## hv226 hv025 hv206 hv208 hv209 hv244
## 6.201391e+13 3.726203e+13 3.700438e+13 3.679614e+13 2.256233e+13 1.431361e+13
## hv213 hv214 hv247 hv243b hv205 hv215
## 1.367162e+13 6.352942e+12 4.677737e+12 3.100045e+12 2.947037e+12 6.774523e+11
## hv241 hv243e hv212 hv216 hv201 hv220
## 3.632965e+11 2.934382e+11 2.757574e+11 1.143190e+11 9.953771e+10 9.526583e+10
## hv221 hv237 hv246b hv040 hv243d hv246d
## 6.884063e+10 2.146149e+10 9.494544e+09 5.365374e+09 5.063757e+09 4.430787e+09
## Grow a deeper regression tree by manually lowering the complexity
## parameter to 0.001 (no cross-validated cp selection is done here)
cp3 <- 0.001
dt3 <- rpart(hv271 ~ ., data = traindf, method = "anova", cp = cp3)
summary(dt3)
## Call:
## rpart(formula = hv271 ~ ., data = traindf, method = "anova",
## cp = cp3)
## n= 10214
##
## CP nsplit rel error xerror xstd
## 1 0.630948665 0 1.00000000 1.00020395 0.015472453
## 2 0.101162893 1 0.36905134 0.37028576 0.007822632
## 3 0.098306311 2 0.26788844 0.26118562 0.006546728
## 4 0.028664760 3 0.16958213 0.17071200 0.005054195
## 5 0.018954143 4 0.14091737 0.14141064 0.004017292
## 6 0.006914147 5 0.12196323 0.12287966 0.003578845
## 7 0.006705071 6 0.11504908 0.11693149 0.003510940
## 8 0.006671769 7 0.10834401 0.11292743 0.003462634
## 9 0.006143629 8 0.10167224 0.10482036 0.003426244
## 10 0.004692157 9 0.09552861 0.09858180 0.003357561
## 11 0.003978692 10 0.09083645 0.09293645 0.003264036
## 12 0.003960219 11 0.08685776 0.08999181 0.003242922
## 13 0.003386240 12 0.08289754 0.08588757 0.003059009
## 14 0.002788733 13 0.07951130 0.08339450 0.002975613
## 15 0.002487637 14 0.07672257 0.07936435 0.002938308
## 16 0.001944458 15 0.07423493 0.07717209 0.002907692
## 17 0.001789649 16 0.07229048 0.07531841 0.002896222
## 18 0.001760663 17 0.07050083 0.07313251 0.002805449
## 19 0.001722961 18 0.06874016 0.07263000 0.002797436
## 20 0.001448665 19 0.06701720 0.07115337 0.002734830
## 21 0.001435067 20 0.06556854 0.07043797 0.002728605
## 22 0.001407958 21 0.06413347 0.07032118 0.002727825
## 23 0.001254925 22 0.06272551 0.06801393 0.002538144
## 24 0.001094580 23 0.06147059 0.06609345 0.002542648
## 25 0.001086488 24 0.06037601 0.06479144 0.002510171
## 26 0.001053168 25 0.05928952 0.06465067 0.002501184
## 27 0.001009545 26 0.05823635 0.06343546 0.002488078
## 28 0.001000000 27 0.05722681 0.06273256 0.002382972
##
## Variable importance
## hv226 hv025 hv206 hv208 hv209 hv244 hv213 hv214 hv247 hv243b hv205
## 25 15 15 15 9 6 6 3 2 1 1
##
## Node number 1: 10214 observations, complexity param=0.6309487
## mean=-1603.993, MSE=9.621312e+09
## left son=2 (7264 obs) right son=3 (2950 obs)
## Primary splits:
## hv226 splits as RRRRRRRLRR, improve=0.6309487, (0 missing)
## hv206 splits as LR, improve=0.5905257, (0 missing)
## hv208 splits as LR, improve=0.5769706, (0 missing)
## hv025 splits as RL, improve=0.5629566, (0 missing)
## hv213 splits as LLRLRRRRRR, improve=0.5082371, (0 missing)
## Surrogate splits:
## hv025 splits as RL, agree=0.873, adj=0.561, (0 split)
## hv206 splits as LR, agree=0.843, adj=0.457, (0 split)
## hv208 splits as LR, agree=0.824, adj=0.392, (0 split)
## hv209 splits as LR, agree=0.786, adj=0.258, (0 split)
## hv244 splits as RL, agree=0.778, adj=0.231, (0 split)
##
## Node number 2: 7264 observations, complexity param=0.1011629
## mean=-51256.05, MSE=2.61084e+09
## left son=4 (4754 obs) right son=5 (2510 obs)
## Primary splits:
## hv213 splits as LLRLRRRRRL, improve=0.5241979, (0 missing)
## hv214 splits as LLLLLLRRLRLLRL-R, improve=0.4770795, (0 missing)
## hv208 splits as LR, improve=0.2952548, (0 missing)
## hv025 splits as RL, improve=0.2831897, (0 missing)
## hv206 splits as LR, improve=0.2708360, (0 missing)
## Surrogate splits:
## hv214 splits as LLLLLLRRLRRLRL-R, agree=0.870, adj=0.624, (0 split)
## hv025 splits as RL, agree=0.718, adj=0.185, (0 split)
## hv205 splits as RRRR-RRLLLLL, agree=0.692, adj=0.109, (0 split)
## hv247 splits as LR, agree=0.690, adj=0.102, (0 split)
## hv206 splits as LR, agree=0.680, adj=0.073, (0 split)
##
## Node number 3: 2950 observations, complexity param=0.09830631
## mean=120657.9, MSE=5.865187e+09
## left son=6 (1557 obs) right son=7 (1393 obs)
## Primary splits:
## hv208 splits as LR, improve=0.5583515, (0 missing)
## hv209 splits as LR, improve=0.5171638, (0 missing)
## hv206 splits as LR, improve=0.4869690, (0 missing)
## hv213 splits as LRRLRRRLRL, improve=0.3516020, (0 missing)
## hv247 splits as LR, improve=0.3473129, (0 missing)
## Surrogate splits:
## hv206 splits as LR, agree=0.865, adj=0.714, (0 split)
## hv209 splits as LR, agree=0.782, adj=0.538, (0 split)
## hv247 splits as LR, agree=0.701, adj=0.367, (0 split)
## hv243b splits as LR, agree=0.679, adj=0.321, (0 split)
## hv213 splits as LRRLRRRLRR, agree=0.667, adj=0.294, (0 split)
##
## Node number 4: 4754 observations, complexity param=0.006705071
## mean=-78137.03, MSE=6.42506e+08
## left son=8 (1041 obs) right son=9 (3713 obs)
## Primary splits:
## hv215 splits as LLRRLRRRR-RR--, improve=0.2157234, (0 missing)
## hv214 splits as LLLLLL--LRLLRL-R, improve=0.1997467, (0 missing)
## hv243a splits as LR, improve=0.1941441, (0 missing)
## hv025 splits as RL, improve=0.1755999, (0 missing)
## hv205 splits as --R--RRLLLRL, improve=0.1482406, (0 missing)
## Surrogate splits:
## hv214 splits as LLRRRL--RRRRRR-R, agree=0.786, adj=0.023, (0 split)
## hv241 splits as LRRR, agree=0.784, adj=0.014, (0 split)
## hv246b < 3.5 to the right, agree=0.784, adj=0.014, (0 split)
## hv243d splits as RL, agree=0.783, adj=0.008, (0 split)
## hv246d < 18 to the right, agree=0.782, adj=0.007, (0 split)
##
## Node number 5: 2510 observations, complexity param=0.02866476
## mean=-342.8375, MSE=2.37816e+09
## left son=10 (2303 obs) right son=11 (207 obs)
## Primary splits:
## hv208 splits as LR, improve=0.4719149, (0 missing)
## hv206 splits as LR, improve=0.4138054, (0 missing)
## hv209 splits as LR, improve=0.3189899, (0 missing)
## hv247 splits as LR, improve=0.3165087, (0 missing)
## hv025 splits as RL, improve=0.2241800, (0 missing)
## Surrogate splits:
## hv206 splits as LR, agree=0.948, adj=0.367, (0 split)
## hv209 splits as LR, agree=0.940, adj=0.271, (0 split)
## hv243e splits as LR, agree=0.924, adj=0.072, (0 split)
## hv221 splits as LR, agree=0.918, adj=0.010, (0 split)
##
## Node number 6: 1557 observations, complexity param=0.006914147
## mean=66529.39, MSE=1.897943e+09
## left son=12 (220 obs) right son=13 (1337 obs)
## Primary splits:
## hv213 splits as L--L-RRRRR, improve=0.2299307, (0 missing)
## hv214 splits as LLLLLLR-RRRLRRRR, improve=0.2260315, (0 missing)
## hv206 splits as LR, improve=0.1899410, (0 missing)
## hv205 splits as LRRRLRLLLLLL, improve=0.1631874, (0 missing)
## hv243b splits as LR, improve=0.1601806, (0 missing)
## Surrogate splits:
## hv214 splits as LRLLLRR-LRRLRRRR, agree=0.885, adj=0.186, (0 split)
## hv215 splits as RL-LLRRRR-RR--, agree=0.863, adj=0.027, (0 split)
## hv226 splits as LR-RRRR-LR, agree=0.861, adj=0.014, (0 split)
## hv201 splits as RRRRRRRRRLRRRRR, agree=0.859, adj=0.005, (0 split)
## hv205 splits as RRRRRRRRRLRR, agree=0.859, adj=0.005, (0 split)
##
## Node number 7: 1393 observations, complexity param=0.01895414
## mean=181159, MSE=3.364277e+09
## left son=14 (1035 obs) right son=15 (358 obs)
## Primary splits:
## hv205 splits as LRLL-LLLL-LL, improve=0.3974583, (0 missing)
## hv213 splits as LLL-RLRLRL, improve=0.3700823, (0 missing)
## hv209 splits as LR, improve=0.3503209, (0 missing)
## hv243e splits as LR, improve=0.3433737, (0 missing)
## hv212 splits as LR, improve=0.2853069, (0 missing)
## Surrogate splits:
## hv241 splits as RLLL, agree=0.792, adj=0.190, (0 split)
## hv212 splits as LR, agree=0.781, adj=0.148, (0 split)
## hv213 splits as LLL-RLRLLL, agree=0.757, adj=0.053, (0 split)
## hv201 splits as RLLLLLLLRLLLLRL, agree=0.753, adj=0.039, (0 split)
## hv221 splits as LR, agree=0.749, adj=0.022, (0 split)
##
## Node number 8: 1041 observations
## mean=-100371.4, MSE=2.632583e+08
##
## Node number 9: 3713 observations, complexity param=0.004692157
## mean=-71903.27, MSE=5.713709e+08
## left son=18 (1713 obs) right son=19 (2000 obs)
## Primary splits:
## hv243a splits as LR, improve=0.2173500, (0 missing)
## hv025 splits as RL, improve=0.1960208, (0 missing)
## hv214 splits as LLLLLL--LRLLRL-R, improve=0.1929872, (0 missing)
## hv205 splits as --R--RRLLLRL, improve=0.1490087, (0 missing)
## hv243b splits as LR, improve=0.1375063, (0 missing)
## Surrogate splits:
## hv207 splits as LR, agree=0.643, adj=0.227, (0 split)
## hv243b splits as LR, agree=0.611, adj=0.157, (0 split)
## hv246 splits as LR, agree=0.590, adj=0.110, (0 split)
## hv246f < 0.5 to the left, agree=0.584, adj=0.098, (0 split)
## hv012 < 4.5 to the left, agree=0.584, adj=0.097, (0 split)
##
## Node number 10: 2303 observations, complexity param=0.006671769
## mean=-10386.46, MSE=1.10947e+09
## left son=20 (1692 obs) right son=21 (611 obs)
## Primary splits:
## hv025 splits as RL, improve=0.2566029, (0 missing)
## hv247 splits as LR, improve=0.2290053, (0 missing)
## hv205 splits as LRRR-RRLLLRL, improve=0.2029599, (0 missing)
## hv243a splits as LR, improve=0.1788949, (0 missing)
## hv207 splits as LR, improve=0.1434461, (0 missing)
## Surrogate splits:
## hv201 splits as LRRLLLLLLLL---L, agree=0.744, adj=0.036, (0 split)
## hv237 splits as LLR, agree=0.743, adj=0.033, (0 split)
## hv206 splits as LR, agree=0.739, adj=0.018, (0 split)
## hv214 splits as -LLLL-RL-LRLR--R, agree=0.738, adj=0.013, (0 split)
## hv040 < 1.5 to the right, agree=0.737, adj=0.008, (0 split)
##
## Node number 11: 207 observations, complexity param=0.001789649
## mean=111398.6, MSE=2.884666e+09
## left son=22 (133 obs) right son=23 (74 obs)
## Primary splits:
## hv209 splits as LR, improve=0.2945318, (0 missing)
## hv213 splits as ----LRRLL-, improve=0.2208200, (0 missing)
## hv243e splits as LR, improve=0.2032407, (0 missing)
## hv206 splits as LR, improve=0.1743461, (0 missing)
## hv212 splits as LR, improve=0.1658422, (0 missing)
## Surrogate splits:
## hv243e splits as LR, agree=0.671, adj=0.081, (0 split)
## hv011 < 3.5 to the left, agree=0.667, adj=0.068, (0 split)
## hv210 splits as LR, agree=0.662, adj=0.054, (0 split)
## hv012 < 9.5 to the left, agree=0.657, adj=0.041, (0 split)
## hv201 splits as RRRLLLLLLLL---L, agree=0.657, adj=0.041, (0 split)
##
## Node number 12: 220 observations, complexity param=0.001009545
## mean=15030.9, MSE=1.294514e+09
## left son=24 (63 obs) right son=25 (157 obs)
## Primary splits:
## hv025 splits as RL, improve=0.3483584, (0 missing)
## hv214 splits as LLLLL---RR-LR--R, improve=0.2306552, (0 missing)
## hv206 splits as LR, improve=0.2293939, (0 missing)
## hv205 splits as RRR--RRLLLRL, improve=0.1857657, (0 missing)
## hv201 splits as --RRLRLRLLR-R-R, improve=0.1815635, (0 missing)
## Surrogate splits:
## hv205 splits as RRR--RRRLRRR, agree=0.759, adj=0.159, (0 split)
## hv214 splits as RLRLR---RR-RR--R, agree=0.755, adj=0.143, (0 split)
## hv215 splits as -L-RL--RL-RR--, agree=0.745, adj=0.111, (0 split)
## hv201 splits as --RRRRRRLRR-R-R, agree=0.727, adj=0.048, (0 split)
## hv226 splits as RL---RR-L-, agree=0.727, adj=0.048, (0 split)
##
## Node number 13: 1337 observations, complexity param=0.003978692
## mean=75003.33, MSE=1.489033e+09
## left son=26 (1070 obs) right son=27 (267 obs)
## Primary splits:
## hv206 splits as LR, improve=0.1963970, (0 missing)
## hv213 splits as -----RRLLL, improve=0.1894837, (0 missing)
## hv247 splits as LR, improve=0.1882796, (0 missing)
## hv205 splits as LRRRLLLLLLLL, improve=0.1742404, (0 missing)
## hv243b splits as LR, improve=0.1682900, (0 missing)
## Surrogate splits:
## hv209 splits as LR, agree=0.818, adj=0.086, (0 split)
## hv040 < -5 to the right, agree=0.802, adj=0.007, (0 split)
## hv205 splits as LLLRLLLLLLLL, agree=0.802, adj=0.007, (0 split)
## hv215 splits as RL-R-LLLL-LL--, agree=0.802, adj=0.007, (0 split)
## hv226 splits as -L-RLLL--L, agree=0.801, adj=0.004, (0 split)
##
## Node number 14: 1035 observations, complexity param=0.006143629
## mean=159652.9, MSE=1.759999e+09
## left son=28 (528 obs) right son=29 (507 obs)
## Primary splits:
## hv209 splits as LR, improve=0.3314381, (0 missing)
## hv243e splits as LR, improve=0.2542904, (0 missing)
## hv247 splits as LR, improve=0.2466412, (0 missing)
## hv213 splits as LLL--LRLRL, improve=0.2332752, (0 missing)
## hv201 splits as RLLLLLLLLLLRLRR, improve=0.1663839, (0 missing)
## Surrogate splits:
## hv247 splits as LR, agree=0.611, adj=0.205, (0 split)
## hv216 < 1.5 to the left, agree=0.603, adj=0.189, (0 split)
## hv213 splits as LRL--LRLRL, agree=0.598, adj=0.179, (0 split)
## hv220 < 33.5 to the left, agree=0.587, adj=0.158, (0 split)
## hv243e splits as LR, agree=0.583, adj=0.148, (0 split)
##
## Node number 15: 358 observations, complexity param=0.003960219
## mean=243334.7, MSE=2.799372e+09
## left son=30 (207 obs) right son=31 (151 obs)
## Primary splits:
## hv243e splits as LR, improve=0.3883342, (0 missing)
## hv212 splits as LR, improve=0.3477234, (0 missing)
## hv213 splits as --L-RLRLR-, improve=0.2885903, (0 missing)
## hv201 splits as RLLLLLLLLLLR-RR, improve=0.2543288, (0 missing)
## hv209 splits as LR, improve=0.2383280, (0 missing)
## Surrogate splits:
## hv212 splits as LR, agree=0.707, adj=0.305, (0 split)
## hv226 splits as RRRR-LL-R-, agree=0.628, adj=0.119, (0 split)
## hv201 splits as RLLLLLLLLRLL-RR, agree=0.626, adj=0.113, (0 split)
## hv247 splits as LR, agree=0.626, adj=0.113, (0 split)
## hv241 splits as RLL-, agree=0.623, adj=0.106, (0 split)
##
## Node number 18: 1713 observations, complexity param=0.00109458
## mean=-83944.62, MSE=1.954337e+08
## left son=36 (1519 obs) right son=37 (194 obs)
## Primary splits:
## hv214 splits as LLLLL----RLLRL--, improve=0.32130740, (0 missing)
## hv205 splits as --R--RRLLLRL, improve=0.21453740, (0 missing)
## hv201 splits as RRRRRRLRLLLR---, improve=0.17294140, (0 missing)
## hv025 splits as RL, improve=0.14849050, (0 missing)
## hv207 splits as LR, improve=0.07991174, (0 missing)
## Surrogate splits:
## hv201 splits as LLLLLLLLLLLR---, agree=0.887, adj=0.005, (0 split)
## hv205 splits as --L--LLLLLRL, agree=0.887, adj=0.005, (0 split)
## hv215 splits as --LL-LLLL--R--, agree=0.887, adj=0.005, (0 split)
## hv221 splits as LR, agree=0.887, adj=0.005, (0 split)
##
## Node number 19: 2000 observations, complexity param=0.002788733
## mean=-61589.85, MSE=6.62807e+08
## left son=38 (1796 obs) right son=39 (204 obs)
## Primary splits:
## hv025 splits as RL, improve=0.2067378, (0 missing)
## hv208 splits as LR, improve=0.1728325, (0 missing)
## hv214 splits as LLLLLL--LRLLR--R, improve=0.1727517, (0 missing)
## hv212 splits as LR, improve=0.1700778, (0 missing)
## hv209 splits as LR, improve=0.1571962, (0 missing)
## Surrogate splits:
## hv237 splits as LLR, agree=0.901, adj=0.029, (0 split)
## hv206 splits as LR, agree=0.900, adj=0.025, (0 split)
## hv201 splits as LRLLLLLLLLLL--L, agree=0.899, adj=0.010, (0 split)
## hv214 splits as LLLLLL--LLLLL--R, agree=0.899, adj=0.010, (0 split)
## hv215 splits as ---L-L-LR-LL--, agree=0.898, adj=0.005, (0 split)
##
## Node number 20: 1692 observations, complexity param=0.00338624
## mean=-20525.79, MSE=7.654139e+08
## left son=40 (1556 obs) right son=41 (136 obs)
## Primary splits:
## hv247 splits as LR, improve=0.2569515, (0 missing)
## hv243a splits as LR, improve=0.1608756, (0 missing)
## hv207 splits as LR, improve=0.1564948, (0 missing)
## hv205 splits as RRR--RRLLLRL, improve=0.1490329, (0 missing)
## hv211 splits as LR, improve=0.1421848, (0 missing)
## Surrogate splits:
## hv201 splits as LLLLLLLLLLL---R, agree=0.921, adj=0.022, (0 split)
## hv215 splits as LLLLLLRLLLLL--, agree=0.921, adj=0.022, (0 split)
## hv243e splits as LR, agree=0.921, adj=0.022, (0 split)
## hv241 splits as RLLL, agree=0.921, adj=0.015, (0 split)
## hv213 splits as --L-LRLLL-, agree=0.920, adj=0.007, (0 split)
##
## Node number 21: 611 observations, complexity param=0.001435067
## mean=17691.67, MSE=9.891666e+08
## left son=42 (341 obs) right son=43 (270 obs)
## Primary splits:
## hv243b splits as LR, improve=0.2333413, (0 missing)
## hv247 splits as LR, improve=0.1808688, (0 missing)
## hv213 splits as ----L-RLL-, improve=0.1764094, (0 missing)
## hv214 splits as -LLLL-R--RLLR--R, improve=0.1588980, (0 missing)
## hv243a splits as LR, improve=0.1573900, (0 missing)
## Surrogate splits:
## hv216 < 3.5 to the left, agree=0.637, adj=0.178, (0 split)
## hv247 splits as LR, agree=0.630, adj=0.163, (0 split)
## hv012 < 7.5 to the left, agree=0.627, adj=0.156, (0 split)
## hv211 splits as LR, agree=0.615, adj=0.130, (0 split)
## hv207 splits as LR, agree=0.601, adj=0.096, (0 split)
##
## Node number 22: 133 observations
## mean=89656.36, MSE=1.651197e+09
##
## Node number 23: 74 observations
## mean=150475.8, MSE=2.724921e+09
##
## Node number 24: 63 observations
## mean=-18492.35, MSE=4.807779e+08
##
## Node number 25: 157 observations
## mean=28482.9, MSE=9.891338e+08
##
## Node number 26: 1070 observations, complexity param=0.002487637
## mean=66460.87, MSE=1.06226e+09
## left son=52 (958 obs) right son=53 (112 obs)
## Primary splits:
## hv213 splits as -----RRLLL, improve=0.2150813, (0 missing)
## hv247 splits as LR, improve=0.1890525, (0 missing)
## hv243b splits as LR, improve=0.1839440, (0 missing)
## hv205 splits as LRRLLRLLLLRL, improve=0.1777065, (0 missing)
## hv201 splits as RLLLLLLLLLLRLRR, improve=0.1400835, (0 missing)
## Surrogate splits:
## hv205 splits as LRLLLLLLLLLL, agree=0.898, adj=0.027, (0 split)
## hv012 < 20.5 to the left, agree=0.897, adj=0.018, (0 split)
## hv246a < 7.5 to the left, agree=0.897, adj=0.018, (0 split)
##
## Node number 27: 267 observations, complexity param=0.001448665
## mean=109237.2, MSE=1.734924e+09
## left son=54 (245 obs) right son=55 (22 obs)
## Primary splits:
## hv205 splits as LRLR-LLLL-LL, improve=0.3073310, (0 missing)
## hv243e splits as LR, improve=0.2753632, (0 missing)
## hv213 splits as -----LRL--, improve=0.2551368, (0 missing)
## hv247 splits as LR, improve=0.2178186, (0 missing)
## hv209 splits as LR, improve=0.2067656, (0 missing)
## Surrogate splits:
## hv215 splits as L--L--LLL--R--, agree=0.925, adj=0.091, (0 split)
## hv201 splits as RLLLLLLLLLL-L-L, agree=0.921, adj=0.045, (0 split)
## hv246f < 13.5 to the left, agree=0.921, adj=0.045, (0 split)
##
## Node number 28: 528 observations, complexity param=0.001053168
## mean=135985.8, MSE=9.651528e+08
## left son=56 (328 obs) right son=57 (200 obs)
## Primary splits:
## hv247 splits as LR, improve=0.2030944, (0 missing)
## hv213 splits as L-L--RRLRR, improve=0.1879628, (0 missing)
## hv201 splits as -RLLLLLL-LL-L-R, improve=0.1866501, (0 missing)
## hv243e splits as LR, improve=0.1753470, (0 missing)
## hv243b splits as LR, improve=0.1635960, (0 missing)
## Surrogate splits:
## hv243e splits as LR, agree=0.655, adj=0.090, (0 split)
## hv213 splits as L-R--LRLLR, agree=0.642, adj=0.055, (0 split)
## hv040 < 381.5 to the left, agree=0.638, adj=0.045, (0 split)
## hv216 < 3.5 to the left, agree=0.638, adj=0.045, (0 split)
## hv212 splits as LR, agree=0.636, adj=0.040, (0 split)
##
## Node number 29: 507 observations, complexity param=0.001944458
## mean=184300.2, MSE=1.396944e+09
## left son=58 (390 obs) right son=59 (117 obs)
## Primary splits:
## hv243e splits as LR, improve=0.2697998, (0 missing)
## hv213 splits as LLR--LRLRL, improve=0.2383709, (0 missing)
## hv247 splits as LR, improve=0.2279579, (0 missing)
## hv201 splits as RLLLLLLLLLLR-RR, improve=0.1857227, (0 missing)
## hv212 splits as LR, improve=0.1404558, (0 missing)
## Surrogate splits:
## hv214 splits as -LR-L-RLRLLRLLRL, agree=0.781, adj=0.051, (0 split)
## hv226 splits as RRRL-LL---, agree=0.777, adj=0.034, (0 split)
## hv215 splits as ---R-RLL-LRL--, agree=0.775, adj=0.026, (0 split)
## hv221 splits as LR, agree=0.773, adj=0.017, (0 split)
## hv213 splits as LLR--LLLLL, agree=0.771, adj=0.009, (0 split)
##
## Node number 30: 207 observations, complexity param=0.001086488
## mean=215174.4, MSE=1.634418e+09
## left son=60 (85 obs) right son=61 (122 obs)
## Primary splits:
## hv213 splits as --L--LRLR-, improve=0.3155890, (0 missing)
## hv209 splits as LR, improve=0.2667738, (0 missing)
## hv201 splits as LRLLLLRLL-LR--R, improve=0.2016098, (0 missing)
## hv230a splits as RLLRL, improve=0.1705785, (0 missing)
## hv025 splits as LR, improve=0.1524198, (0 missing)
## Surrogate splits:
## hv220 < 31.5 to the left, agree=0.633, adj=0.106, (0 split)
## hv230a splits as RLRRL, agree=0.633, adj=0.106, (0 split)
## hv201 splits as LRRRLRRRL-LL--R, agree=0.628, adj=0.094, (0 split)
## hv214 splits as ------R--R-LR-LL, agree=0.628, adj=0.094, (0 split)
## hv045c splits as -RRLRR, agree=0.618, adj=0.071, (0 split)
##
## Node number 31: 151 observations
## mean=281938.5, MSE=1.819019e+09
##
## Node number 36: 1519 observations
## mean=-86776.55, MSE=1.160144e+08
##
## Node number 37: 194 observations
## mean=-61770.92, MSE=2.628121e+08
##
## Node number 38: 1796 observations, complexity param=0.001760663
## mean=-65535.02, MSE=4.76771e+08
## left son=76 (1780 obs) right son=77 (16 obs)
## Primary splits:
## hv208 splits as LR, improve=0.2020646, (0 missing)
## hv243e splits as LR, improve=0.1855175, (0 missing)
## hv214 splits as LLLLLL--LRLLR--R, improve=0.1474377, (0 missing)
## hv211 splits as LR, improve=0.1456801, (0 missing)
## hv247 splits as LR, improve=0.1346353, (0 missing)
##
## Node number 39: 204 observations
## mean=-26856.91, MSE=9.57249e+08
##
## Node number 40: 1556 observations, complexity param=0.001407958
## mean=-24671.88, MSE=4.827509e+08
## left son=80 (866 obs) right son=81 (690 obs)
## Primary splits:
## hv205 splits as RRR--RRLLLRL, improve=0.1841990, (0 missing)
## hv243a splits as LR, improve=0.1818572, (0 missing)
## hv207 splits as LR, improve=0.1648626, (0 missing)
## hv214 splits as -LLLL--L-RLLR--R, improve=0.1607202, (0 missing)
## hv211 splits as LR, improve=0.1307780, (0 missing)
## Surrogate splits:
## hv230a splits as RLLRL, agree=0.588, adj=0.071, (0 split)
## hv201 splits as RLRRRRLLLLL----, agree=0.585, adj=0.064, (0 split)
## hv244 splits as RL, agree=0.569, adj=0.028, (0 split)
## hv011 < 2.5 to the left, agree=0.566, adj=0.022, (0 split)
## hv214 splits as -LLLL--R-LLLR--R, agree=0.566, adj=0.022, (0 split)
##
## Node number 41: 136 observations
## mean=26910.32, MSE=1.552552e+09
##
## Node number 42: 341 observations
## mean=4172.956, MSE=6.548029e+08
##
## Node number 43: 270 observations
## mean=34765.3, MSE=8.891334e+08
##
## Node number 52: 958 observations, complexity param=0.001722961
## mean=61292.62, MSE=8.079664e+08
## left son=104 (540 obs) right son=105 (418 obs)
## Primary splits:
## hv243b splits as LR, improve=0.2187494, (0 missing)
## hv247 splits as LR, improve=0.1816620, (0 missing)
## hv201 splits as RLLLLLLLLLLRLRR, improve=0.1357486, (0 missing)
## hv207 splits as LR, improve=0.1268753, (0 missing)
## hv205 splits as RRRRRRRLLLRR, improve=0.1246600, (0 missing)
## Surrogate splits:
## hv247 splits as LR, agree=0.641, adj=0.177, (0 split)
## hv211 splits as LR, agree=0.603, adj=0.091, (0 split)
## hv040 < 87.5 to the left, agree=0.597, adj=0.077, (0 split)
## hv207 splits as LR, agree=0.591, adj=0.062, (0 split)
## hv205 splits as RRLLLRLLLLRL, agree=0.588, adj=0.055, (0 split)
##
## Node number 53: 112 observations
## mean=110667.8, MSE=1.05465e+09
##
## Node number 54: 245 observations
## mean=102317.7, MSE=9.073984e+08
##
## Node number 55: 22 observations
## mean=186294.7, MSE=4.479495e+09
##
## Node number 56: 328 observations
## mean=125053.2, MSE=7.338539e+08
##
## Node number 57: 200 observations
## mean=153915.3, MSE=8.269976e+08
##
## Node number 58: 390 observations
## mean=173666.8, MSE=8.879907e+08
##
## Node number 59: 117 observations
## mean=219744.8, MSE=1.460243e+09
##
## Node number 60: 85 observations
## mean=187965.4, MSE=8.945052e+08
##
## Node number 61: 122 observations
## mean=234131.5, MSE=1.274754e+09
##
## Node number 76: 1780 observations, complexity param=0.001254925
## mean=-66465.59, MSE=2.992565e+08
## left son=152 (1487 obs) right son=153 (293 obs)
## Primary splits:
## hv214 splits as LLLLLL--LRLLR--R, improve=0.2315178, (0 missing)
## hv205 splits as --R--RRLLLLL, improve=0.1510356, (0 missing)
## hv211 splits as LR, improve=0.1405112, (0 missing)
## hv243b splits as LR, improve=0.1279039, (0 missing)
## hv201 splits as RRRRRRRRRLLL--R, improve=0.1240041, (0 missing)
## Surrogate splits:
## hv045c splits as RLLLLL, agree=0.836, adj=0.003, (0 split)
## hv201 splits as LLLLLLLLLLLL--R, agree=0.836, adj=0.003, (0 split)
## hv205 splits as --R--LLLLLLL, agree=0.836, adj=0.003, (0 split)
## hv209 splits as LR, agree=0.836, adj=0.003, (0 split)
## hv221 splits as LR, agree=0.836, adj=0.003, (0 split)
##
## Node number 77: 16 observations
## mean=37991.12, MSE=9.411255e+09
##
## Node number 80: 866 observations
## mean=-33089.14, MSE=2.802928e+08
##
## Node number 81: 690 observations
## mean=-14107.6, MSE=5.363245e+08
##
## Node number 104: 540 observations
## mean=49595.98, MSE=6.197975e+08
##
## Node number 105: 418 observations
## mean=76403.12, MSE=6.459861e+08
##
## Node number 152: 1487 observations
## mean=-70160.4, MSE=2.236666e+08
##
## Node number 153: 293 observations
## mean=-47714.11, MSE=2.619799e+08
## Visualise the cp = 0.001 tree, score the hold-out set, and report RMSE
rpart.plot(dt3, extra = 1, main = "Decision Tree with cp=0.001")
pred3 <- predict(dt3, newdata = testdf)
rmse3 <- rmse(actual = testdf$hv271, predicted = pred3)
rmse3
## [1] 25401.74
## Predictors ranked by their total contribution to split improvement
dt3$variable.importance
## hv226 hv025 hv206 hv208 hv209 hv244
## 6.207302e+13 3.763530e+13 3.740209e+13 3.696917e+13 2.277231e+13 1.431742e+13
## hv213 hv214 hv247 hv243b hv205 hv243e
## 1.403263e+13 6.623549e+12 5.210779e+12 3.482801e+12 3.263280e+12 9.046184e+11
## hv215 hv243a hv241 hv212 hv201 hv216
## 7.184846e+11 4.611081e+11 4.094277e+11 3.984551e+11 1.915529e+11 1.440478e+11
## hv207 hv220 hv012 hv221 hv246f hv246
## 1.285546e+11 1.065710e+11 7.838628e+10 7.308242e+10 5.169357e+10 5.087532e+10
## hv211 hv237 hv040 hv230a hv011 hv210
## 3.367391e+10 2.952192e+10 2.591376e+10 2.113099e+10 1.489117e+10 9.506623e+09
## hv246b hv045c hv243d hv246d hv246a
## 9.494544e+09 7.957712e+09 5.063757e+09 4.430787e+09 4.365451e+09
## Bar chart of variable importance for the cross-validated tree (dt2).
## Rows are arranged ascending by importance so that, after coord_flip(),
## the most important predictor appears at the top of the chart.
df2 <- data.frame(imp = dt2$variable.importance)
df3 <- df2 %>%
  tibble::rownames_to_column() %>%
  dplyr::rename("variable" = rowname) %>%
  dplyr::arrange(imp) %>%
  # fct_inorder() freezes the sorted row order as the factor level order
  dplyr::mutate(variable = forcats::fct_inorder(variable))
ggplot2::ggplot(df3) +
  # fill is a fixed colour (set outside aes()), so no legend is generated;
  # show.legend = FALSE (was the reassignable shorthand `F`) makes that explicit
  geom_col(aes(x = variable, y = imp),
           col = "black", fill = "darkgreen", show.legend = FALSE) +
  coord_flip() +
  # NOTE(review): scale_fill_grey() was removed — fill is not a mapped
  # aesthetic here, so the scale was a silent no-op and the plot is unchanged
  theme_bw()